In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

### 0. Importing data

In [13]:
# Read the competition's train and test splits into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/tweet-sentiment-extraction/train.csv",
        "../datasets/tweet-sentiment-extraction/test.csv",
    )
)

### 1. Data Exploration

In [14]:
# Understanding the data at a high level

In [15]:
# Preview the test set; it has textID, text and sentiment but no selected_text column.
test_df

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive


In [16]:
# Preview the training set; it additionally carries the selected_text column.
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [17]:
# Per-column summary; for these string columns pandas reports count / unique / top / freq.
train_df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [18]:
# Same summary, restricted to the text column.
train_df["text"].describe()

count                                    27480
unique                                   27480
top        I`d have responded, if I were going
freq                                         1
Name: text, dtype: object

In [19]:
# (rows, columns) of the training set.
train_df.shape

(27481, 4)

In [20]:
## Class balance: number of tweets per sentiment label
train_df["sentiment"].value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [21]:
## Does the text column contain any missing values?
train_df["text"].isna().any()

True

In [22]:
# Number of rows whose text is missing.
train_df["text"].isna().sum()

1

In [23]:
# Index labels of the rows whose text is missing.
missing_text_mask = train_df["text"].isna()
train_df.loc[missing_text_mask].index

Int64Index([314], dtype='int64')

In [24]:
# Inspect the row at position 314, the one reported above as having a missing text.
train_df.iloc[314, :]
# It makes sense to keep this row even though its text is missing.

textID           fdb77c3752
text                    NaN
selected_text           NaN
sentiment           neutral
Name: 314, dtype: object

In [25]:
# Replace missing text / selected_text values with empty strings.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which emits a warning on
# recent pandas and leaves train_df unchanged when Copy-on-Write is enabled.
train_df = train_df.fillna({"text": "", "selected_text": ""})

In [26]:
# Re-check row 314 after filling the missing values.
train_df.iloc[314, :]

textID           fdb77c3752
text                       
selected_text              
sentiment           neutral
Name: 314, dtype: object

In [27]:
## adding new column with additional info
#train_df["text_len"] = train_df["text"].apply(len)

In [28]:
## does text_len have any correlation with the target? if yes, would be valuable to include as a feature


In [29]:
#label_encode = {"positive": 1, "neutral": 0, "negative": -1}
#train_df["sentiment"] = train_df.sentiment.apply(lambda row: label_encode[row])

In [30]:
# Preview train_df; the text_len and label-encoding cells above are commented out,
# so the columns are unchanged.
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [31]:
# train_df holds only string columns, so train_df.corr() has nothing numeric to
# correlate (empty result on older pandas, ValueError on pandas >= 2).
# Build the two numeric series the question is about -- tweet length and an
# ordinal encoding of the label -- in a throwaway frame, leaving train_df intact.
label_encode = {"positive": 1, "neutral": 0, "negative": -1}
corr = pd.DataFrame(
    {
        "text_len": train_df["text"].str.len(),
        "sentiment": train_df["sentiment"].map(label_encode),
    }
).corr()

In [32]:
# Display the correlation matrix computed in the previous cell.
corr

In [33]:
# No correlation with text_len

In [34]:
# (Stray token removed: it raised NameError and stopped "Run All" before preprocessing.)

### 2. Data Preprocessing

In [35]:
# Starting point for preprocessing: preview train_df.
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [36]:
# train_df["text_split"]=train_df.text.apply(lambda row: str(row).split())
# train_df["selected_text_split"]=train_df.selected_text.apply(lambda row: str(row).split())

In [37]:
#train_df["text_len"] = train_df["text"].apply(len)

In [38]:
# Missing values per column (expected to be zero everywhere by now).
train_df.isna().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [39]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((27481, 4), (3534, 3))

In [40]:
# Count fully duplicated rows in the train and test frames, in that order.
tuple(frame.duplicated().sum() for frame in (train_df, test_df))

(0, 0)

In [41]:
# Uniformity: lower-case both text columns (values are passed through str() first).
for text_col in ("text", "selected_text"):
    train_df[text_col] = train_df[text_col].map(lambda value: str(value).lower())

In [42]:
# Spot-check ten random rows after lower-casing.
# random_state pins the draw so a fresh "Run All" shows the same rows.
train_df.sample(10, random_state=42)

Unnamed: 0,textID,text,selected_text,sentiment
15059,cc0945a961,alredy had my chocolate it is impossible to ...,it is impossible to resist ;),positive
17547,a231d7864b,had a good day selling at feria urbana. the la...,good day se,positive
8460,e2abcae593,smh @ playing dress up! lol. i can`t see the ...,smh,negative
8052,7c17aea36e,..ok brother...did you change your num and no...,.you no good dude,negative
18536,6af2e23e1c,sat in the pub. pretty quiet so far. prob leav...,pretty quiet so far. pr,negative
2240,978068a4d7,watching supernatural those boys can hunt me ...,watching supernatural those boys can hunt me ...,neutral
7158,bac11ddc14,i know. it sucks,. it sucks,negative
25654,937d903511,okayyy you can read it to me cause then i`ll ...,special,positive
20816,4c1726ce91,including myself... guess umma be partying al...,including myself... guess umma be partying alo...,neutral
13962,4cfbcced8f,"yer, oh that`s **** cause u hell need to post...","yer, oh that`s **** cause u hell need to post ...",neutral


In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# Inputs are (text, sentiment); the target is the selected_text span.
features = train_df[['text', 'sentiment']]
target = train_df['selected_text']
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (x_train, x_val, y_train, y_val))


((21984, 2), (5497, 2), (21984,), (5497,))

In [44]:
from transformers import RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [45]:
## Converting strings to a sequence of ids (integer), using the tokenizer and vocabulary.


In [46]:
# Load the pretrained 'roberta-base' tokenizer with add_prefix_space enabled.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base',add_prefix_space=True)

In [47]:
## Tokenize and prepare for the model a sequence or a pair of sequences.

In [48]:
# Fixed sequence width and one row per training example.
max_len = 128
count = len(x_train)

# Pre-allocated int32 buffers, filled row by row by the tokenization loop.
input_ids = np.zeros((count, max_len), dtype='int32')
attention_mask = np.zeros_like(input_ids)


In [49]:
# Show the first training text as a quick sanity check.
first_text = x_train['text'].iloc[0]
print(first_text)

doctor who has finished


In [50]:
from tqdm import tqdm

# Encode every (text, sentiment) pair into one fixed-width row of token ids
# plus its attention mask.
# - total=... gives tqdm a length; wrapping enumerate() alone hides it, so only
#   a running counter was shown instead of a progress bar.
# - max_length reuses max_len so the encoding always matches the buffer width.
# - padding='max_length' replaces the deprecated pad_to_max_length=True.
# - truncation=True keeps over-long pairs at max_len tokens; without it such a
#   row would not fit into the pre-allocated arrays.
# - return_tensors is omitted: plain lists are enough to fill the numpy rows.
for i, (text, sentiment) in tqdm(enumerate(x_train.values), total=len(x_train)):
    val = tokenizer.encode_plus(
        text,
        sentiment,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        verbose=False,
    )
    input_ids[i] = val['input_ids']
    attention_mask[i] = val['attention_mask']

21984it [00:09, 2310.20it/s]


In [82]:
# NOTE(review): this cell's execution count (82) is out of sequence with the cells
# above, and the sentiment column in its output is numeric -- presumably the labels
# were encoded by a cell run later. Confirm with Restart Kernel -> Run All.
x_train

Unnamed: 0,text,sentiment
11293,doctor who has finished,0
11299,you should.,0
18204,"back at school again. almost weekend. oh wait,...",0
22728,my computer is so slooowww this morning. i th...,0
1231,on my way to dazzle bar!!,0
...,...,...
21575,star trek was pure awesome! love it!!! <3333 ...,1
5390,"will be going to indiana baptist sunday, pray ...",0
860,is sitting thru the boring bits in titanic wai...,0
15795,missed the play,-1
