In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package wordnet to /home/danish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/danish/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
df = pd.read_csv("dataset/data.txt", delimiter=',',encoding = "utf-8")
df.head()


In [4]:
df['text'] = [entry.lower() for entry in df['text']]
df['text']= [word_tokenize(entry) for entry in df['text']]

In [5]:
Corpus  = df
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    Corpus.loc[index,'text'] = str(Final_words)
Corpus.head()

Unnamed: 0,text,label
0,"['cozy', 'glam', 'aaliyah', 'inspire', 'ootd',...",0
1,"['guy', 'chubbygirlsdoitbetter', 'beyourself',...",1
2,"['beautiful', 'disaster']",0
3,"['part', 'mom']",0
4,"['depressed', 'piece', 'shit']",0


In [6]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text'],Corpus['label'],test_size=0.2)

In [7]:
Tfidf_vect = TfidfVectorizer(analyzer='word',max_features=5000)
Tfidf_vect.fit(Corpus['text'])
Train_X_Tfidf_word = Tfidf_vect.transform(Train_X)
Test_X_Tfidf_word = Tfidf_vect.transform(Test_X)

In [8]:
Tfidf_vect = TfidfVectorizer(analyzer='word',ngram_range=(2,3),max_features=5000)
Tfidf_vect.fit(Corpus['text'])
Train_X_Tfidf_ng = Tfidf_vect.transform(Train_X)
Test_X_Tfidf_ng = Tfidf_vect.transform(Test_X)

In [9]:
Tfidf_vect = TfidfVectorizer(analyzer='char',ngram_range=(2,3),max_features=5000)
Tfidf_vect.fit(Corpus['text'])
Train_X_Tfidf_chr = Tfidf_vect.transform(Train_X)
Test_X_Tfidf_chr = Tfidf_vect.transform(Test_X)

# SVM word,n-gram,char

In [10]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf_word,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf_word)
print("Accuracy -> ",accuracy_score( Test_Y,predictions_SVM)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_SVM, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_SVM, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_SVM, average="macro"))   

Accuracy ->  91.08333333333334
F1 Score ->  0.8595762132604239
Precision ->  0.8737120895495267
Recall ->  0.8473958927191416


In [11]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf_ng,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf_ng)
print("Accuracy -> ",accuracy_score( Test_Y,predictions_SVM)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_SVM, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_SVM, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_SVM, average="macro"))   

Accuracy ->  88.08333333333334
F1 Score ->  0.7948033591981547
Precision ->  0.8491799421926464
Recall ->  0.763248577907846


In [12]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf_chr,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf_chr)
print("Accuracy -> ",accuracy_score( Test_Y,predictions_SVM)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_SVM, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_SVM, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_SVM, average="macro"))   

Accuracy ->  90.91666666666667
F1 Score ->  0.8621124416112604
Precision ->  0.861578947368421
Recall ->  0.8626493355123965


# Random Forest word,char,n-gram

In [14]:
RFC = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
RFC.fit(Train_X_Tfidf_word,Train_Y)
predictions_RFC = RFC.predict(Test_X_Tfidf_word)
print("Accuracy -> ",accuracy_score( Test_Y,predictions_RFC)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_RFC, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_RFC, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_RFC, average="macro"))  

Accuracy ->  79.83333333333333
F1 Score ->  0.4745184895663692
Precision ->  0.8432689616568709
Recall ->  0.5155384946726971


In [13]:
RFC = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
RFC.fit(Train_X_Tfidf_chr,Train_Y)
predictions_RFC = RFC.predict(Test_X_Tfidf_chr)
print("Accuracy -> ",accuracy_score( Test_Y,predictions_RFC)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_RFC, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_RFC, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_RFC, average="macro"))  

Accuracy ->  82.58333333333333
F1 Score ->  0.5984572304115814
Precision ->  0.8693576388888888
Recall ->  0.5862503642329571


In [17]:
RFC = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
RFC.fit(Train_X_Tfidf_ng,Train_Y)
predictions_RFC = RFC.predict(Test_X_Tfidf_ng)
print("Accuracy -> ",accuracy_score( Test_Y,predictions_RFC)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_RFC, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_RFC, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_RFC, average="macro"))  

Accuracy ->  79.25
F1 Score ->  0.4421199442119944
Precision ->  0.39625
Recall ->  0.5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Logistic Regression

In [19]:
LR = LogisticRegression(random_state=0)
LR.fit(Train_X_Tfidf_word,Train_Y)
predictions_LR = LR.predict(Test_X_Tfidf_word)
print("Accuracy -> ",accuracy_score(Test_Y,predictions_LR)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_LR, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_LR, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_LR, average="macro"))  

Accuracy ->  89.5
F1 Score ->  0.8181818181818182
Precision ->  0.8802618791877653
Recall ->  0.7825624263615978


In [20]:
LR = LogisticRegression(random_state=0)
LR.fit(Train_X_Tfidf_chr,Train_Y)
predictions_LR = LR.predict(Test_X_Tfidf_chr)
print("Accuracy -> ",accuracy_score(Test_Y,predictions_LR)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_LR, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_LR, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_LR, average="macro"))  

Accuracy ->  89.66666666666666
F1 Score ->  0.8359607540513724
Precision ->  0.8525875974653239
Recall ->  0.8221529651729949


In [21]:
LR = LogisticRegression(random_state=0)
LR.fit(Train_X_Tfidf_ng,Train_Y)
predictions_LR = LR.predict(Test_X_Tfidf_ng)
print("Accuracy -> ",accuracy_score(Test_Y,predictions_LR)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_LR, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_LR, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_LR, average="macro"))  

Accuracy ->  85.08333333333333
F1 Score ->  0.6970273987094044
Precision ->  0.8571105072463768
Recall ->  0.6598317560462671


In [21]:
pip install "tensorflow>=1.7.0"

Note: you may need to restart the kernel to use updated packages.


In [25]:
import tensorflow_hub as hub
import tensorflow as tf

elmo = hub.Module("https://tfhub.dev/google/elmo/2",)

RuntimeError: variable_scope module_2/ was unused but the corresponding name_scope was already taken.

In [20]:
x = ["Nothing suits me like suit"] # Extract ELMo features 
embeddings = elmo(x, signature="default", as_dict=True)["elmo"] 
embeddings.shape

TypeError: 'AutoTrackable' object is not callable