In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score

In [2]:
## To identify hatespeech in the tweets ##
## Dataset obtained from Analytics Vidhya practice problem ##

#train = pd.read_csv("~/Documents/Study/studypython/twitter_data/train_E6oV3lV.csv")
#test = pd.read_csv("~/Documents/Study/studypython/twitter_data/test_tweets_anuFYb8.csv")


In [3]:
## Sentiment140 twitter data from Stanford: http://help.sentiment140.com/for-students/

cols = ['sentiment','id','date','query_string','user','text']

train = pd.read_csv("~/Documents/Study/studypython/Sentiment140/trainingandtestdata/training.1600000.processed.noemoticon.csv", encoding = "latin-1", header=None, names=cols)
test = pd.read_csv("~/Documents/Study/studypython/Sentiment140/trainingandtestdata/testdata.manual.2009.06.14.csv", encoding = "latin-1", header=None, names=cols)



In [4]:
train['sentiment'].value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [5]:
train.head()

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
test.head()

Unnamed: 0,sentiment,id,date,query_string,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [7]:
## Drop unwanted attributes/dimensions:
# in this case: id, date, query_string, user attributes are not necessary

train.drop(['id','date','query_string','user'],axis=1,inplace=True)
test.drop(['id','date','query_string','user'],axis=1,inplace=True)


In [8]:
## training set has only 0 and 4 polarity (positive and negative sentiments. there are no
## traiing data for the neutral class) But the test set has instances belonging to all 3 classes - 0, 2 and 4
## therefore we are removing those test instances with polarity 2 (neutral class)

test = test[test['sentiment'] != 2]

In [9]:
test['sentiment'].value_counts()

4    182
0    177
Name: sentiment, dtype: int64

In [10]:
## Map the sentiment polarity values from {0,4} to {0,1} in both train and test sets:

train['sentiment'] = train['sentiment'].map({0: 0, 4: 1})
test['sentiment'] = test['sentiment'].map({0: 0, 4: 1})

In [11]:
## Contraction Handling:

# Mapping Contractions to their expanded forms:

contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" } 

# converting all apostrophes to single quotes:
def apos_handling(text):
    return re.sub("’", "'", text)

def contraction_handling(text):
    return ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split()])

In [12]:
## Remove HTML encoding:
def remove_html_enc(text):
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text()

## Remove the twitter user handles, hashtags (only the symbol #) and url links (http and www)
pattern1 = re.compile(r"(@[A-Za-z0-9_]*)|(#)|(www.[^ ]+)|(https?://[^ ]+)")

def clean_tweet(text):
    return pattern1.sub("", text)


## Remove the non-ascii characters (that represent the emoticons)
def remove_nonascii(text):
    return ''.join([i if ord(i) < 128 else '' for i in text])


## Remove UTF-BOM (Byte Order Mark) characters
def remove_utfbom(text):
    try:
        clean = text.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = text
    return clean


## Remove short words whose length is <= n
def remove_short_words(text):
    n = 1
    words = word_tokenize(text)
    return ' '.join([w for w in words if len(w) > n])


## Remove non-letter characters: numbers, punctuations
pattern2 = re.compile("([^A-Za-z\s]+)")

def remove_nonletter(text):
    return pattern2.sub("", text)

## Negation Handling: (subsumed in contraction handling above)

negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
negation_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def neg_handling(text):
    return negation_pattern.sub(lambda x : negations_dic[x.group()], text)


## Simple Spell Correction: a character that is repeated 2 or more times is shortened to 2 repetitions
def simple_spellcorrect(text):
    return re.sub(r'(.)\1+', r'\1\1', text)



In [13]:

train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

train['text'] = train['text'].apply(remove_html_enc)
test['text'] = test['text'].apply(remove_html_enc)

train['text'] = train['text'].apply(clean_tweet)
test['text'] = test['text'].apply(clean_tweet)

train['text'] = train['text'].apply(remove_utfbom)
test['text'] = test['text'].apply(remove_utfbom)

train['text'] = train['text'].apply(apos_handling)
test['text'] = test['text'].apply(apos_handling)

train['text'] = train['text'].apply(contraction_handling)
test['text'] = test['text'].apply(contraction_handling)

#train['text'] = train['text'].apply(neg_handling)
#test['text'] = test['text'].apply(neg_handling)

train['text'] = train['text'].apply(remove_nonletter)
test['text'] = test['text'].apply(remove_nonletter)

#train['text'] = train['text'].apply(remove_nonascii)
#test['text'] = test['text'].apply(remove_nonascii)

train['text'] = train['text'].apply(remove_short_words)
test['text'] = test['text'].apply(remove_short_words)

train['text'] = train['text'].apply(simple_spellcorrect)
test['text'] = test['text'].apply(simple_spellcorrect)



In [14]:
train.head()

Unnamed: 0,sentiment,text
0,0,aww that is bummer you shoulda got david carr ...
1,0,is upset that he can not update his facebook b...
2,0,dived many times for the ball managed to save ...
3,0,my whole body feels itchy and like its on fire
4,0,no it is not behaving at all am mad why am her...


In [15]:
test.head()

Unnamed: 0,sentiment,text
0,1,loovvee my kindle not that the dx is cool but ...
1,1,reading my kindle love it lee childs is good read
2,1,ok first assesment of the kindle it fucking rocks
3,1,you will love your kindle have had mine for fe...
4,1,fair enough but have the kindle and think it i...


In [23]:
## Remove stop words:
'''
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    words_wo_stop = [w for w in words if w not in stop_words]
    return ' '.join(words_wo_stop)

train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)
'''

In [21]:
## Stemming:
'''
porter_stemmer = PorterStemmer()

def get_stems(text):
    words = word_tokenize(text)
    return ' '.join([porter_stemmer.stem(w) for w in words])

train['text'] = train['text'].apply(get_stems)
test['text'] = test['text'].apply(get_stems)
'''

In [16]:
## Lemmatization: does it make any difference because of lack of proper POS structure in the tweets??

# map the pos_tags to wordnet pos tags

tagmap = defaultdict(lambda : wordnet.NOUN)
tagmap['J'] = wordnet.ADJ
tagmap['V'] = wordnet.VERB
tagmap['R'] = wordnet.ADV

lemmatizer = WordNetLemmatizer()

def get_lemmas(text):
    words = word_tokenize(text)
    return ' '.join([lemmatizer.lemmatize(w, pos = tagmap[t[0]]) for w, t in (nltk.pos_tag(words))])

train['text'] = train['text'].apply(get_lemmas)
test['text'] = test['text'].apply(get_lemmas)


In [87]:
## Vectorization - Feature Extraction
'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

train_counts = count_vect.fit_transform(train['text'])

train_tfidf = tfidf_transformer.fit_transform(train_counts)

test_counts = count_vect.transform(test['text'])

test_tfidf = tfidf_transformer.transform(test_counts)
'''
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words = 'english', ngram_range=(1,3))
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test['text'])



In [88]:
train_tfidf.shape

(1600000, 9421254)

In [89]:
## Build Classifiers: Naive Bayes, SVM, Logistic Regression, Ensemble

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier

train_target = train['sentiment']

# Train the classifier

# 1. Naive Bayes:
cl1 = MultinomialNB()
cl1.fit(train_tfidf, train_target)

# 2. Logistic Regression:
cl2 = LogisticRegression()
cl2.fit(train_tfidf, train_target)

# 3. SVM:
cl3 = SGDClassifier(loss='hinge', penalty = 'l2', alpha=1e-3, max_iter=5, random_state=42)
cl3.fit(train_tfidf, train_target)

# 4. Ensemble:
estimators = []
estimators.append(('naivebayes', cl1))
estimators.append(('logistic', cl2))
estimators.append(('svm', cl3))
ensemble = VotingClassifier(estimators)
ensemble.fit(train_tfidf, train_target)




VotingClassifier(estimators=[('naivebayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solve...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [90]:
from sklearn.metrics import f1_score

# Predict the test data labels:

pr1 = cl1.predict(test_tfidf)
pr2 = cl2.predict(test_tfidf)
pr3 = cl3.predict(test_tfidf)
pr4 = ensemble.predict(test_tfidf)



  if diff:


In [91]:
# Find the accuracy of the various classifiers

test_target = test['sentiment']

print("Naive Bayes:", accuracy_score(test_target, pr1))
print("Logistic Regression:", accuracy_score(test_target, pr2))
print("SVM:", accuracy_score(test_target, pr3))
print("Ensemble:", accuracy_score(test_target, pr4))


Naive Bayes: 0.7994428969359332
Logistic Regression: 0.8161559888579387
SVM: 0.7632311977715878
Ensemble: 0.8189415041782729


In [98]:
#test['sentiment'].unique()

test['sentiment'].value_counts()

4    182
0    177
2    139
Name: sentiment, dtype: int64

In [26]:
idcol = test["id"]
result1 = pd.DataFrame({"id": idcol, "label": pr1})
result1.to_csv("~/Documents/Study/studypython/twitter_data/nb.csv", sep = ",", index=False, columns=["id", "sentiment"])

result2 = pd.DataFrame({"id": idcol, "label": pr2})
result2.to_csv("~/Documents/Study/studypython/twitter_data/lr.csv", sep = ",", index=False, columns=["id", "sentiment"])

result3 = pd.DataFrame({"id": idcol, "label": pr3})
result3.to_csv("~/Documents/Study/studypython/twitter_data/svm.csv", sep = ",", index=False, columns=["id", "sentiment"])

result4 = pd.DataFrame({"id": idcol, "label": pr4})
result4.to_csv("~/Documents/Study/studypython/twitter_data/ensemble.csv", sep = ",", index=False, columns=["id", "sentiment"])



# Unigram All the 363979 features: 
Naive Bayes: 0.7715877437325905
Logistic Regression: 0.7938718662952646
SVM: 0.7437325905292479
Ensemble: 0.7994428969359332

# Unigram 50000 features:
Naive Bayes: 0.7827298050139275
Logistic Regression: 0.7883008356545961
SVM: 0.7520891364902507
Ensemble: 0.7994428969359332

# Unigram 10000 features:
Naive Bayes: 0.7994428969359332
Logistic Regression: 0.7994428969359332
SVM: 0.766016713091922
Ensemble: 0.8161559888579387

# Unigram 5000 features:
Naive Bayes: 0.8133704735376045
Logistic Regression: 0.8133704735376045
SVM: 0.7715877437325905
Ensemble: 0.8217270194986073

# Unigram 2500 features:
Naive Bayes: 0.7910863509749304
Logistic Regression: 0.7938718662952646
SVM: 0.766016713091922
Ensemble: 0.7938718662952646

# Unigram 1000 features:
Naive Bayes: 0.7827298050139275
Logistic Regression: 0.7771587743732591
SVM: 0.7520891364902507
Ensemble: 0.7771587743732591

-------------------------------------------------------------------

# Bigram All 3794859 features:
Naive Bayes: 0.7910863509749304
Logistic Regression: 0.8245125348189415
SVM: 0.7715877437325905
Ensemble: 0.8189415041782729

# Bigram 20000 features:
Naive Bayes: 0.7855153203342619
Logistic Regression: 0.8105849582172702
SVM: 0.7632311977715878
Ensemble: 0.8022284122562674

# Bigram 10000 features:
Naive Bayes: 0.7910863509749304
Logistic Regression: 0.8161559888579387
SVM: 0.7632311977715878
Ensemble: 0.8133704735376045

# Bigram 5000 features:
Naive Bayes: 0.7883008356545961
Logistic Regression: 0.807799442896936
SVM: 0.766016713091922
Ensemble: 0.7966573816155988

-----------------------------------------------------------------

# Trigram All 9421254 features:
Naive Bayes: 0.7994428969359332
Logistic Regression: 0.8161559888579387
SVM: 0.7632311977715878
Ensemble: 0.8189415041782729

# Trigram 20000 features:
Naive Bayes: 0.7855153203342619
Logistic Regression: 0.8105849582172702
SVM: 0.766016713091922
Ensemble: 0.7994428969359332

# Trigram 10000 features:
Naive Bayes: 0.7883008356545961
Logistic Regression: 0.8161559888579387
SVM: 0.7632311977715878
Ensemble: 0.8105849582172702

# Trigram 5000 features:
Naive Bayes: 0.7855153203342619
Logistic Regression: 0.807799442896936
SVM: 0.766016713091922
Ensemble: 0.7938718662952646
