In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting labelled data
from nltk.corpus import stopwords # English stop-word list
from textblob import TextBlob # text blobs (used below to tokenize phrases into words)
from textblob import Word # single-word wrapper used for lemmatization
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF text features

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the competition files from the Kaggle input directory.
# train.tsv and test.tsv are tab-separated. The sample submission is a .csv
# file, so it is read with the default comma separator; forcing sep="\t" on a
# comma-separated file would collapse all of its columns into a single one.
train = pd.read_csv('../input/train.tsv', sep="\t")
test = pd.read_csv('../input/test.tsv', sep="\t")
submission = pd.read_csv('../input/sampleSubmission.csv')

In [None]:
# Display the (rows, columns) tuple of the training frame.
train.shape

In [None]:
# Display the (rows, columns) tuple of the test frame.
test.shape

**Text data preprocessing**

Lower case

In [None]:
# Lower-case every phrase and collapse runs of whitespace to single spaces.
# The step is applied to BOTH frames: test is lemmatized later in the
# notebook, so it has to go through the same normalisation as train for the
# two splits to yield comparable tokens.
for frame in (train, test):
    frame['Phrase'] = frame['Phrase'].str.lower().str.split().str.join(" ")

Removing Punctuation

In [None]:
# Delete every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this step into a silent no-op.
# A raw string avoids invalid-escape warnings for \w and \s.
# The same cleaning is applied to test so both splits are tokenized alike.
for frame in (train, test):
    frame['Phrase'] = frame['Phrase'].str.replace(r'[^\w\s]', '', regex=True)

Removal of Stop Words

In [None]:
# English stop-word list from NLTK; `stop` stays a list as before.
stop = stopwords.words('english')
# Set copy of the list so the per-token membership test is a hash lookup
# rather than a linear scan of the list.
stop_lookup = frozenset(stop)


def drop_stop_words(phrase):
    """Return `phrase` with every whitespace-separated stop word removed."""
    return " ".join(token for token in phrase.split() if token not in stop_lookup)


# Filter both frames: removing stop words from train only would make the
# n-grams seen at prediction time differ from those learned during fitting.
for frame in (train, test):
    frame['Phrase'] = frame['Phrase'].apply(drop_stop_words)

Tokenization
Tokenization refers to dividing the text into a sequence of words or sentences. In this example, we have used the textblob library to first transform phrases into a blob and then converted them into a series of words.

In [None]:
# Tokenize the phrase stored under index label 1 of each frame.
# A notebook cell only displays its last expression, so the two word lists
# are gathered into a single dict; otherwise the train result would be
# computed and silently discarded.
{
    'train': TextBlob(train['Phrase'][1]).words,
    'test': TextBlob(test['Phrase'][1]).words,
}

Lemmatization
Lemmatization is a more effective option than stemming because it converts the word into its root word, rather than just stripping the suffixes. It makes use of the vocabulary and does a morphological analysis to obtain the root word. Therefore, we usually prefer using lemmatization over stemming.

In [None]:
# Replace each whitespace-separated token of every training phrase with its
# lemma and re-join the tokens with single spaces.
train['Phrase'] = train['Phrase'].apply(
    lambda phrase: " ".join(Word(token).lemmatize() for token in phrase.split())
)

In [None]:
# Same lemmatization as for the training phrases, applied to the test phrases.
test['Phrase'] = test['Phrase'].apply(
    lambda phrase: " ".join(Word(token).lemmatize() for token in phrase.split())
)

Inverse Document Frequency
The intuition behind inverse document frequency (IDF) is that a word is not of much use to us if it’s appearing in all the documents.

Therefore, the IDF of each word is the log of the ratio of the total number of rows to the number of rows in which that word is present.

$\mathrm{IDF} = \log(N/n)$, where $N$ is the total number of rows and $n$ is the number of rows in which the word is present.

TfidfVectorizer can lowercase letters, disregard punctuation and stopwords.

In [None]:
# Build TF-IDF features over n-grams in the range (1, 2).
tfidf = TfidfVectorizer(ngram_range=(1,2))
# The vectorizer is fitted on the training phrases only ...
train_vect = tfidf.fit_transform(train['Phrase'])
# ... and the already-fitted vectorizer is reused to transform the test phrases.
test_vect = tfidf.transform(test['Phrase'])

In [None]:
# Shape of the TF-IDF matrix built from the training phrases.
train_vect.shape

In [None]:
# Shape of the TF-IDF matrix built from the test phrases.
test_vect.shape

In [None]:
# Disabled: each hstack call would wrap a single matrix only, and the cells
# below use train_vect / test_vect directly.
#train_features = hstack([train_vect])
#test_features = hstack([test_vect])

In [None]:
# Fit a label encoder on the Sentiment column and keep the encoded targets in y.
le = LabelEncoder()
y = le.fit_transform(train['Sentiment'].values)
# Disabled alternative: one-hot encoded targets.
#y = pd.get_dummies(train.Sentiment)
#y.head()

**Model selection**

* Logistic Regression
* Linear Support Vector Machine
* Multinomial Naive Bayes

In [None]:
# Candidate classifiers compared below.
# LinearSVC is seeded like LogisticRegression so that repeated runs of the
# notebook give the same model; MultinomialNB takes no seed.
lsv = LinearSVC(random_state=0)
nb = MultinomialNB()
lr = LogisticRegression(random_state=0)

In [None]:
# Hold out 25% of the labelled data for validation.
# random_state pins the shuffle so the accuracies printed for the three
# models are reproducible across Restart & Run All.
X_train_vect, X_test_vect, y_train_vect, y_test_vect = train_test_split(
    train_vect, y, train_size=0.75, random_state=0
)

# Multinomial Naive Bayes: fit on the training part, score on the hold-out part.
nb.fit(X_train_vect, y_train_vect)
predictions_nb = nb.predict(X_test_vect)
accuracy = accuracy_score(y_test_vect, predictions_nb)
print(accuracy)

In [None]:
# Logistic regression: fit on the training part, then score on the hold-out part.
predictions_lr = lr.fit(X_train_vect, y_train_vect).predict(X_test_vect)
accuracy = accuracy_score(y_true=y_test_vect, y_pred=predictions_lr)
print(accuracy)

In [None]:
# Linear SVM: fit on the training part, then score on the hold-out part.
predictions_lsv = lsv.fit(X_train_vect, y_train_vect).predict(X_test_vect)
accuracy = accuracy_score(y_true=y_test_vect, y_pred=predictions_lsv)
print(accuracy)

**Predict and submission**

In [None]:
# Refit the linear SVM on the full labelled set and predict the test phrases.
predictions_lsv = lsv.fit(train_vect, y).predict(test_vect)

In [None]:
# The model was trained on label-encoded targets, so its predictions are
# encoder class ids. Map them back to the original Sentiment values before
# writing them out, instead of relying on the encoding being the identity.
test['Sentiment'] = le.inverse_transform(predictions_lsv)

# Keep only the two columns selected for the submission and save them
# without the index column.
submission = test[["PhraseId","Sentiment"]]
submission.to_csv("Finalsubmission.csv", index = False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()