## Data Processing

### Import Libraries

In [21]:
import pandas as pd
import numpy as np
import ast
import sklearn
from tqdm.notebook import tqdm
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.casual import casual_tokenize
import requests

The cell below restyles the ipywidgets output area (used by the `tqdm` progress bars) so that it follows the editor's theme colors, e.g. in dark mode.

In [17]:
%%html
<style>
/* Let the notebook background show through behind ipywidget outputs. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Take the widget text color and font size from the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}
</style>

In [3]:
# Load the raw tweet dump. The file has no header row, so the column names are
# supplied directly to read_csv (which already returns a DataFrame; no extra
# pd.DataFrame(...) wrapper or post-hoc column assignment is needed).
df = pd.read_csv(
    '../data/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['sentiment', 'id', 'date', 'query', 'user', 'text'],
)
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(1600000, 6)

In [6]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

sentiment     int64
id            int64
date         object
query        object
user         object
text         object
dtype: object

### Data Cleaning

In [7]:
# Number of missing values in each column.
df.isna().sum()

sentiment    0
id           0
date         0
query        0
user         0
text         0
dtype: int64

In [14]:
# Parse the raw timestamp strings (e.g. "Mon Apr 06 22:19:45 PDT 2009") with an
# explicit format instead of letting pandas guess one per element, which is what
# triggered the warning emitted by the unparameterised pd.to_datetime call.
# The guard keeps the cell safe to re-run once the column is already datetime.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    # Remove the timezone abbreviation sitting between the time and the year.
    # The result is a timezone-naive wall-clock timestamp.
    date_without_tz = df['date'].str.replace(
        r'\s[A-Za-z]{2,5}\s(?=\d{4}$)', ' ', regex=True
    )
    # Any value that does not match the format raises instead of being
    # silently mis-parsed.
    df['date'] = pd.to_datetime(date_without_tz, format='%a %b %d %H:%M:%S %Y')
df.head()

  df['date'] = pd.to_datetime(df['date'])


Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Perfect — there are no missing values, and the `date` column is now parsed into datetime values.

### Text Processing

In [28]:
# Register tqdm's pandas integration so that `progress_apply` is available.
tqdm.pandas()

# Download the NLTK resources into a project-local directory, then add that
# directory to NLTK's search path.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource, download_dir='./nltk_data', quiet=True)
nltk.data.path.append('./nltk_data')

stemmer = PorterStemmer()

# Fetch the stopword list (one word per line) from a pinned gist revision.
# The timeout prevents the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being silently turned into
# the stopword set.
stopwords_response = requests.get(
    "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt",
    timeout=30,
)
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content
stopwords = set(stopwords_list.decode().splitlines())

In [29]:
def _tokenize_tweet(text):
    """Lower-case a tweet and split it with NLTK's casual tokenizer.

    `strip_handles=False` keeps @handles as tokens; `reduce_len=True` is
    passed through to NLTK to shorten long character repetitions.
    """
    return casual_tokenize(text.lower(), reduce_len=True, strip_handles=False)


df['tokenized_text'] = df['text'].progress_apply(_tokenize_tweet)
df.head()

  0%|          | 0/1600000 [00:00<?, ?it/s]

Unnamed: 0,sentiment,id,date,query,user,text,tokenized_text
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@switchfoot, http://twitpic.com/2y1zl, -, aww..."
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face..."
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@kenichan, i, dived, many, times, for, the, b..."
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@nationwideclass, no, ,, it's, not, behaving,..."


In [30]:
# Keep only tokens made up entirely of letters/digits. Any token containing
# another character is dropped as a whole: punctuation, @handles, URLs, and
# contractions such as "can't".
df['alphanum_text'] = df['tokenized_text'].progress_apply(
    lambda tokens: list(filter(str.isalnum, tokens))
)
df.head()

  0%|          | 0/1600000 [00:00<?, ?it/s]

Unnamed: 0,sentiment,id,date,query,user,text,tokenized_text,alphanum_text
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@switchfoot, http://twitpic.com/2y1zl, -, aww...","[awww, a, bummer, you, shoulda, got, david, ca..."
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face...","[is, upset, that, he, update, his, facebook, b..."
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@kenichan, i, dived, many, times, for, the, b...","[i, dived, many, times, for, the, ball, manage..."
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@nationwideclass, no, ,, it's, not, behaving,...","[no, not, behaving, at, all, mad, why, am, i, ..."


In [31]:
def _drop_stopwords(tokens):
    """Return the tokens that are not in the `stopwords` set, in order."""
    kept = []
    for token in tokens:
        if token not in stopwords:
            kept.append(token)
    return kept


df['stopword_removed_text'] = df['alphanum_text'].progress_apply(_drop_stopwords)
df.head()

  0%|          | 0/1600000 [00:00<?, ?it/s]

Unnamed: 0,sentiment,id,date,query,user,text,tokenized_text,alphanum_text,stopword_removed_text
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@switchfoot, http://twitpic.com/2y1zl, -, aww...","[awww, a, bummer, you, shoulda, got, david, ca...","[awww, bummer, shoulda, david, carr, day]"
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face...","[is, upset, that, he, update, his, facebook, b...","[upset, update, facebook, texting, result, sch..."
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@kenichan, i, dived, many, times, for, the, b...","[i, dived, many, times, for, the, ball, manage...","[dived, times, ball, managed, save, 50, rest, ..."
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feels, itchy, and, like, its...","[body, feels, itchy]"
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@nationwideclass, no, ,, it's, not, behaving,...","[no, not, behaving, at, all, mad, why, am, i, ...","[behaving, mad]"


In [32]:
# Reduce every remaining token to its Porter stem (e.g. "feels" -> "feel").
# Tokens are lower-cased before being handed to the stemmer.
df['stemmed_text'] = df['stopword_removed_text'].progress_apply(
    lambda tokens: list(map(stemmer.stem, map(str.lower, tokens)))
)
df.head()

  0%|          | 0/1600000 [00:00<?, ?it/s]

Unnamed: 0,sentiment,id,date,query,user,text,tokenized_text,alphanum_text,stopword_removed_text,stemmed_text
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@switchfoot, http://twitpic.com/2y1zl, -, aww...","[awww, a, bummer, you, shoulda, got, david, ca...","[awww, bummer, shoulda, david, carr, day]","[awww, bummer, shoulda, david, carr, day]"
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face...","[is, upset, that, he, update, his, facebook, b...","[upset, update, facebook, texting, result, sch...","[upset, updat, facebook, text, result, school,..."
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@kenichan, i, dived, many, times, for, the, b...","[i, dived, many, times, for, the, ball, manage...","[dived, times, ball, managed, save, 50, rest, ...","[dive, time, ball, manag, save, 50, rest, bound]"
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feels, itchy, and, like, its...","[body, feels, itchy]","[bodi, feel, itchi]"
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@nationwideclass, no, ,, it's, not, behaving,...","[no, not, behaving, at, all, mad, why, am, i, ...","[behaving, mad]","[behav, mad]"


In [36]:
# Persist only the timestamp and the stemmed tokens; every other column
# (including `sentiment`) is left out of the export.
# NOTE(review): `stemmed_text` holds Python lists, which CSV stores as their
# string representation -- presumably parsed back with `ast.literal_eval`
# when the file is read again; confirm against the consumer.
export_columns = ['date', 'stemmed_text']
df2 = df.loc[:, export_columns]
df2.to_csv('../data/1.6_million_dataset_cleaned.csv', index=False)