In [2]:
import pandas as pd
import numpy as np
import regex as re
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
from collections import Counter
from yellowbrick.text import PosTagVisualizer
import contractions
import os

In [3]:
#load the csv into a pandas dataframe 
df = pd.read_csv("tweets_dataset.csv", encoding = "latin-1")
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
#remove the column with no siginificance 
df.columns = ["Sentiment Level", "Index", "Date", "No_Query", "Username", "Tweet"]
df.drop("No_Query", inplace = True, axis = 1)

In [5]:
#seperate the columns into training labels and features 
df_text = df["Tweet"]
df_class = df["Sentiment Level"]

In [8]:
#concat the label and features columns into a new dataframe
df_concat = pd.concat([df_text, df_class], axis = 1)
df_concat


Unnamed: 0,Tweet,Sentiment Level
0,is upset that he can't update his Facebook by ...,0
1,@Kenichan I dived many times for the ball. Man...,0
2,my whole body feels itchy and like its on fire,0
3,"@nationwideclass no, it's not behaving at all....",0
4,@Kwesidei not the whole crew,0
...,...,...
1599994,Just woke up. Having no school is the best fee...,4
1599995,TheWDB.com - Very cool to hear old Walt interv...,4
1599996,Are you ready for your MoJo Makeover? Ask me f...,4
1599997,Happy 38th Birthday to my boo of alll time!!! ...,4


In [9]:
#removing usernames that starts with @ because usernames are of little significance in detecting depression in the texts
df_concat["Tweet"] = df_concat["Tweet"].str.replace("@[^\s]+", "", regex = True)



In [None]:
#removing punctuation 
df_concat["Tweet"] = df_concat["Tweet"].str.replace(r'[^\w\s]+', '', regex = True)

In [93]:
#encode the characters into ascii and ignore it if the character can't be encoded to ascii, therefore, special characters 
#such as emoticons, Chinese characters, greek characters etc. that are not in the ascii are removed during the encoding. 
#Then, decode the texts again to latin-1

df_concat["Tweet"] = df_concat["Tweet"].apply(lambda x : x.encode("ascii", "ignore").decode())

In [94]:
#extending the contractions, eg. I'm to I am 
df_concat['Tweet'] = df_concat['Tweet'].apply(lambda x: [contractions.fix(word) for word in x.split()])
df_concat["Tweet"] = [' '.join(map(str,l)) for l in df_concat["Tweet"]]


In [53]:
#from nltk.corpus import stopwords 
#stop_words = set(stopwords.words('english'))

#stopwords
#df_concat["Tweet"] = df_concat["Tweet"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)])) 

In [95]:
#tokenize the words
from nltk.tokenize import word_tokenize
df_concat["Tweet_tokenize"] = df_concat["Tweet"].apply(word_tokenize)
df_concat["Tweet"] = ["".join(map(str,l)) for l in df_concat["Tweet"]]
df_concat["joined_Tweet"] = [' '.join(map(str,l)) for l in df_concat["Tweet_tokenize"]]
df_concat.head() 

Unnamed: 0,Tweet,Sentiment Level,Tweet_tokenize,joined_Tweet
0,is upset that he cannot update his Facebook by...,0,"[is, upset, that, he, can, not, update, his, F...",is upset that he can not update his Facebook b...
1,I dived many times for the ball Managed to sav...,0,"[I, dived, many, times, for, the, ball, Manage...",I dived many times for the ball Managed to sav...
2,my whole body feels itchy and like its on fire,0,"[my, whole, body, feels, itchy, and, like, its...",my whole body feels itchy and like its on fire
3,no its not behaving at all i am mad why am i h...,0,"[no, its, not, behaving, at, all, i, am, mad, ...",no its not behaving at all i am mad why am i h...
4,not the whole crew,0,"[not, the, whole, crew]",not the whole crew


In [96]:
df_concat

Unnamed: 0,Tweet,Sentiment Level,Tweet_tokenize,joined_Tweet
0,is upset that he cannot update his Facebook by...,0,"[is, upset, that, he, can, not, update, his, F...",is upset that he can not update his Facebook b...
1,I dived many times for the ball Managed to sav...,0,"[I, dived, many, times, for, the, ball, Manage...",I dived many times for the ball Managed to sav...
2,my whole body feels itchy and like its on fire,0,"[my, whole, body, feels, itchy, and, like, its...",my whole body feels itchy and like its on fire
3,no its not behaving at all i am mad why am i h...,0,"[no, its, not, behaving, at, all, i, am, mad, ...",no its not behaving at all i am mad why am i h...
4,not the whole crew,0,"[not, the, whole, crew]",not the whole crew
...,...,...,...,...
1599994,Just woke up Having no school is the best feel...,4,"[Just, woke, up, Having, no, school, is, the, ...",Just woke up Having no school is the best feel...
1599995,TheWDBcom Very cool to hear old Walt interview...,4,"[TheWDBcom, Very, cool, to, hear, old, Walt, i...",TheWDBcom Very cool to hear old Walt interview...
1599996,Are you ready for your MoJo Makeover Ask me fo...,4,"[Are, you, ready, for, your, MoJo, Makeover, A...",Are you ready for your MoJo Makeover Ask me fo...
1599997,Happy 38th Birthday to my boo of alll time Tup...,4,"[Happy, 38th, Birthday, to, my, boo, of, alll,...",Happy 38th Birthday to my boo of alll time Tup...


In [97]:
#save the data to csv 
df_concat.to_csv("L:\\ML-Assignment\\training_data.csv", index = False, columns = ["joined_Tweet", "Sentiment Level"])