## Predicting sentiment from Reviews

In [None]:
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, auc ,roc_curve
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model,ensemble, metrics, model_selection,naive_bayes
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline

import nltk
from nltk.corpus import stopwords
import string

eng_stopwords = set(stopwords.words("english"))
pd.options.mode.chained_assignment = None

### Read data: load the training set

In [None]:
path = r"train.csv"
# Skip malformed rows (with a warning) instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is its replacement with the same skip-and-warn
# behaviour.
try:
    data = pd.read_csv(path, on_bad_lines="warn")
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; use the legacy flag.
    data = pd.read_csv(path, error_bad_lines=False)

### Let's explore this data together. The data includes the sentiment, the sentiment source and the sentiment text.

In [None]:
# Show the first rows of the loaded frame as a quick sanity check.
data.head()

### Row counts per response class

In [None]:
# For each Is_Response label, count the non-null values in every other column.
# Differences between columns within a row would point at missing values.
data.groupby('Is_Response').count()

### Drop missing rows

In [None]:
# Remove, in place, every row that has at least one missing value.
data.dropna(inplace = True)


### Train test split

In [None]:
# Hold out 30% of the rows for evaluation.
# - random_state pins the shuffle so every metric below is reproducible
#   from run to run.
# - stratify keeps the Is_Response class ratio the same in both splits.
train_df, test_df = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['Is_Response'],
)

## Feature Engineering

In [None]:
## Meta features computed from the raw review text ##

# Built once so the punctuation test is a set lookup per character.
_PUNCTUATION = set(string.punctuation)


def _mean_word_len(text):
    """Return the mean length of the whitespace-separated words in *text*.

    Returns 0.0 when *text* contains no words: ``np.mean([])`` would yield
    NaN (plus a RuntimeWarning), and LogisticRegression rejects NaN inputs.
    """
    lengths = [len(w) for w in text.split()]
    return np.mean(lengths) if lengths else 0.0


# (column name, extractor) pairs. Each extractor receives one description
# already converted to ``str``. The order here is the order in which the
# columns are appended to the frames.
_TEXT_FEATURES = (
    ## Number of words in the text ##
    ("num_words", lambda text: len(text.split())),
    ## Number of unique words in the text ##
    ("num_unique_words", lambda text: len(set(text.split()))),
    ## Number of characters in the text ##
    ("num_chars", len),
    ## Number of stopwords in the text ##
    ("num_stopwords",
     lambda text: sum(1 for w in text.lower().split() if w in eng_stopwords)),
    ## Number of punctuations in the text ##
    ("num_punctuations",
     lambda text: sum(1 for c in text if c in _PUNCTUATION)),
    ## Number of upper case words in the text ##
    ("num_words_upper",
     lambda text: sum(1 for w in text.split() if w.isupper())),
    ## Number of title case words in the text ##
    ("num_words_title",
     lambda text: sum(1 for w in text.split() if w.istitle())),
    ## Average length of the words in the text ##
    ("mean_word_len", _mean_word_len),
)

# Apply the same extractors to both splits so their feature columns cannot
# drift apart.
for _frame in (train_df, test_df):
    _texts = _frame["Description"].map(str)
    for _column, _extract in _TEXT_FEATURES:
        _frame[_column] = _texts.apply(_extract)

In [None]:
# Inspect the training frame with the newly added text meta features.
train_df.head()

### Convert Categorical variables to dummies

In [None]:
# One-hot encode the two categorical columns on the TRAINING split.
# drop_first removes one level per column to avoid a redundant indicator.
Browser_Used = pd.get_dummies(train_df['Browser_Used'],drop_first=True)
Device_Used = pd.get_dummies(train_df['Device_Used'],drop_first=True)
browser_columns = Browser_Used.columns
device_columns = Device_Used.columns

train_df = pd.concat([train_df,Browser_Used,Device_Used],axis=1)

# Encode the test split against the layout learned from the training split.
# Encoding it independently with drop_first could drop a different level or
# produce a different set of columns whenever a category is missing from one
# split, which would misalign train_X and test_X. After reindexing:
#   - levels absent from the test split become all-zero columns;
#   - levels unseen during training are discarded.
Browser_Used = pd.get_dummies(test_df['Browser_Used']).reindex(columns=browser_columns, fill_value=0)
Device_Used = pd.get_dummies(test_df['Device_Used']).reindex(columns=device_columns, fill_value=0)

test_df = pd.concat([test_df,Browser_Used,Device_Used],axis=1)

In [None]:
# Columns that are not model inputs: the identifier, the raw text and the
# categorical columns that were one-hot encoded above.
cols_to_drop = ['User_ID', 'Description','Browser_Used','Device_Used']
# Remove them, together with the target, from both splits; what is left is
# the feature matrix for each split.
train_X, test_X = (
    frame.drop(cols_to_drop + ['Is_Response'], axis=1)
    for frame in (train_df, test_df)
)

In [None]:
# Numeric encoding of the target label: 0 = happy, 1 = not happy.
mapping_dict = {
    'happy': 0,
    'not happy': 1,
}
# Any label missing from mapping_dict is mapped to NaN by Series.map.
train_y, test_y = (
    frame['Is_Response'].map(mapping_dict)
    for frame in (train_df, test_df)
)

### Logistic Regression

In [None]:
%%time

logreg = linear_model.LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(train_X, train_y)

In [None]:
%%time
predictions = logreg.predict(test_X)
proba= logreg.predict_proba(test_X)

In [None]:
# Print accuracy, the confusion matrix and the per-class report, in that
# order, for the baseline model (true labels first, predictions second).
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(test_y, predictions))

### XGBoost

In [None]:
import xgboost as xgb
# `sklearn.grid_search` was removed in scikit-learn 0.20; GridSearchCV now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV

In [None]:
# Gradient-boosted tree classifier with the library's default hyper-parameters.
model = xgb.XGBClassifier()

In [None]:
# Train on the same feature matrix the logistic-regression baseline used.
model.fit(train_X, train_y)

In [None]:
# Same metric helpers as imported at the top of the notebook (redundant re-import).
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, auc ,roc_curve
y_pred = model.predict(test_X) # Predicted class labels for the held-out feature matrix
y_pred

In [None]:
# Per-class probabilities; column 1 belongs to class 1, which mapping_dict
# assigns to 'not happy'.
y_proba = model.predict_proba(test_X)
y_prob=y_proba[:,1]

In [None]:
# Evaluate the XGBoost model on the held-out split.
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be
# symmetric, but the arguments are kept in the documented order so this cell
# matches the other evaluation cells and stays correct if the metric changes.
print(accuracy_score(test_y, y_pred))
cm = confusion_matrix(test_y, y_pred)
print('confusion matrix\n %s' % cm)
print(classification_report(test_y,y_pred))

## BOW model

**Example of a bag-of-words (BOW) model and its sparse matrix**

In [None]:
# Two tiny corpora for illustration; only train_set is vectorised below.
train_set = (
    "The sky sky is blue green.",
    "The sun is bright.",
)
test_set = (
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun.",
)

# Learn the vocabulary (English stop words removed) and build the
# document-term count matrix in a single step.
vectorizer = CountVectorizer(stop_words='english')
document_term_matrix = vectorizer.fit_transform(train_set)

vocabulary = vectorizer.vocabulary_
print(vocabulary)                     # term -> column index
print(len(vocabulary))                # number of distinct terms
print(document_term_matrix)           # sparse matrix of term counts
print(document_term_matrix.getnnz())  # number of stored (non-zero) entries



In [None]:
%%time
### Fit transform the count vectorizer ###
count_vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
count_vec.fit(train_df['Description'].values.tolist() + test_df['Description'].values.tolist())
train_count = count_vec.transform(train_df['Description'].values.tolist())
test_count  = count_vec.transform(test_df['Description'].values.tolist())

### Number of stored non-zero entries in the count matrices

In [None]:
# Number of stored (non-zero) entries in the training count matrix.
# This is not the number of features; that is train_count.shape[1].
train_count.getnnz()

In [None]:
# Number of stored (non-zero) entries in the test count matrix.
# This is not the number of features; that is test_count.shape[1].
test_count.getnnz()

### Train logistic model on train data

In [None]:
%%time

logreg = linear_model.LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(train_count, train_y)

### Evaluate the sentiment model

In [None]:
%%time
predictions = logreg.predict(test_count)
proba= logreg.predict_proba(test_count)

In [None]:
# Accuracy, confusion matrix and per-class report for the bag-of-words
# logistic regression (true labels first, predictions second).
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(test_y, predictions))

In [None]:
# Class probabilities that the next cells add as features for a second model.
# Test rows: scored by the model fitted on the whole training split.
proba_test = logreg.predict_proba(test_count)
# Training rows: use OUT-OF-FOLD probabilities. Scoring the training rows
# with a model that was fitted on those same rows gives over-confident values
# that carry label information the test-row probabilities do not have, so a
# downstream model would learn to trust this feature far more than it should.
# cross_val_predict fits clones on 5 folds; the fitted `logreg` is untouched.
proba_train = model_selection.cross_val_predict(
    logreg, train_count, train_y, cv=5, method='predict_proba'
)

In [None]:
# Attach the logistic-regression class probabilities to each split as two new
# feature columns: column 0 is class 0 ('happy' in mapping_dict) and column 1
# is class 1 ('not happy').
for frame, class_proba in ((train_df, proba_train), (test_df, proba_test)):
    frame["lr_cvec_hap"] = class_proba[:, 0]
    frame["lr_cvec_n_hap"] = class_proba[:, 1]


### XGBoost

In [None]:
# Rebuild the feature matrices so they pick up the columns added to the
# frames since the first build. The same non-feature columns and the target
# are removed as before.
cols_to_drop = ['User_ID', 'Description','Browser_Used','Device_Used']
train_X, test_X = (
    frame.drop(cols_to_drop + ['Is_Response'], axis=1)
    for frame in (train_df, test_df)
)

In [None]:
# List the columns that make up the rebuilt training feature matrix.
train_X.columns

In [None]:
# Fresh default XGBoost classifier, trained on the rebuilt feature matrix.
# This rebinds `model`, replacing the classifier fitted earlier.
model = xgb.XGBClassifier()
model.fit(train_X, train_y)

In [None]:
y_pred = model.predict(test_X) # Predicted class labels for the rebuilt held-out feature matrix
y_pred

In [None]:
# Accuracy, confusion matrix and per-class report for the retrained XGBoost
# model (true labels first, predictions second).
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(test_y, y_pred))

# Test on dummy data 

In [None]:
dummy = 'testsenti.csv'
# Skip malformed rows (with a warning) instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is its replacement with the same skip-and-warn
# behaviour.
try:
    dumm = pd.read_csv(dummy, encoding='latin-1', on_bad_lines="warn")
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; use the legacy flag.
    dumm = pd.read_csv(dummy, encoding='latin-1', error_bad_lines=False)
dumm

In [None]:
%%time
# Vectorise the dummy reviews with the vocabulary already held by count_vec,
# then score them with the bag-of-words logistic regression.
# NOTE(review): this rebinds test_count, predictions and proba, which earlier
# cells used for the held-out split. Re-running those cells after this one
# would silently operate on the dummy data.
test_count = count_vec.transform(dumm['Description'].values.tolist())
predictions = logreg.predict(test_count)
proba= logreg.predict_proba(test_count)

In [None]:
# (rows, columns) of the dummy frame; the row count should match the
# number of predictions shown in the next cell.
dumm.shape

In [None]:
# Shape of the predicted-label array for the dummy reviews.
predictions.shape

In [None]:
# Store the predicted label and the probability of class 1
# ('not happy' in mapping_dict) next to each dummy review.
dumm['pred'] = predictions
dumm['proba'] =proba[:,1]
dumm

## Unhappy

In [None]:
# Value at row 0, column position 1.
# NOTE(review): presumably the review text of an example scored as unhappy — confirm the column order of testsenti.csv.
dumm.iloc[0,1]

In [None]:
# Value at row 16, column position 1; raises IndexError if the file has
# fewer than 17 rows.
# NOTE(review): presumably another unhappy example — confirm against testsenti.csv.
dumm.iloc[16,1]

## Happy

In [None]:
# Value at row 1, column position 1.
# NOTE(review): presumably the review text of an example scored as happy — confirm the column order of testsenti.csv.
dumm.iloc[1,1]