In [1]:
import requests
from urllib.request import urlretrieve
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk


In [2]:
#step 3.1 - load data from the remote source
urlretrieve(url="https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv",
                       filename="dataset.csv")

('dataset.csv', <http.client.HTTPMessage at 0x7c86901786b0>)

In [3]:
dataset = pd.read_csv("../data/dataset.csv")

In [4]:
#inspecting dataset:
dataset.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [5]:
#remove tweet_id col:
dataset = dataset.drop(columns="tweet_id")
dataset

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
39995,neutral,@JohnLloydTaylor
39996,love,Happy Mothers Day All my love
39997,love,Happy Mother's Day to all the mommies out ther...
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [6]:
# check classes:
dataset['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [7]:
#check number of samples for each class:
dataset['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [8]:
classes = ["sadness","happiness"]

In [9]:
modified_dataset = dataset.loc[dataset['sentiment'].isin(classes)]

In [10]:
#verify:
modified_dataset

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
6,sadness,"I should be sleep, but im not! thinking about ..."
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?
...,...,...
39986,happiness,going to watch boy in the striped pj's hope i ...
39987,happiness,"gave the bikes a thorough wash, degrease it an..."
39988,happiness,"had SUCH and AMAZING time last night, McFly we..."
39994,happiness,Succesfully following Tayla!!


In [11]:
#map classes (1 for happiness - 0 for sadness)
modified_dataset['sentiment'] = modified_dataset.loc[:,'sentiment'].apply(lambda x: classes.index(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  modified_dataset['sentiment'] = modified_dataset.loc[:,'sentiment'].apply(lambda x: classes.index(x))


In [12]:
modified_dataset

Unnamed: 0,sentiment,content
1,0,Layin n bed with a headache ughhhh...waitin o...
2,0,Funeral ceremony...gloomy friday...
6,0,"I should be sleep, but im not! thinking about ..."
8,0,@charviray Charlene my love. I miss you
9,0,@kelcouch I'm sorry at least it's Friday?
...,...,...
39986,1,going to watch boy in the striped pj's hope i ...
39987,1,"gave the bikes a thorough wash, degrease it an..."
39988,1,"had SUCH and AMAZING time last night, McFly we..."
39994,1,Succesfully following Tayla!!


In [13]:
train_set,valid_set = train_test_split(modified_dataset,test_size=0.3)
valid_set,test_set = train_test_split(valid_set,test_size=0.5)

In [14]:
#reset indices positioning:
datasets = {
    "train":train_set,
    "valid":valid_set,
    "test":test_set
}

In [15]:
for set in datasets:
    print(f"number of samples in {set}_set is: {len(datasets[set])}")

number of samples in train_set is: 7261
number of samples in valid_set is: 1556
number of samples in test_set is: 1557


In [16]:
datasets

{'train':        sentiment                                            content
 33333          1  @Scorch_Mom Emailed you - it'll be on the porc...
 27170          0  Really tired, and need to be up in the morning...
 37175          1                Happy Birthday to @AndrewGirdwood !
 22243          1  i reckon i could live of yogurt for a week if ...
 9039           0  its impossible to watch in full ANY flash vide...
 ...          ...                                                ...
 33541          1                          *[ It's time for TSCC!  ]
 3750           0  Want 2 transfer my Naini trip pictures 2my com...
 13910          0  Omg what is up with parents today !  I've been...
 4904           0  watching chicago. @bambamonline honeyyyy im wi...
 30777          1     @geramie yeah I was thinking about that ,ahaha
 
 [7261 rows x 2 columns],
 'valid':        sentiment                                            content
 22711          1  @ComedyQueen Well.... you're not calle

In [17]:
for set in datasets:
    datasets[set] = datasets[set].reset_index(drop=True)

In [20]:
# last step in data ingestion:
for set in datasets:
    datasets[set].to_csv(f"../data/{set}_set.csv",index=False)