In [8]:
import pandas as pd
import nltk

In [11]:
df=pd.read_csv('sentiment_nlp (1).csv')
df.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [12]:
data=df[['text', 'target']]
data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [13]:
data.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

## Data Cleaning

In [14]:
data.dtypes

text      object
target     int64
dtype: object

## Feature Generation

## BoW Bag of Words Model

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [19]:
#tokenizer to remove unwanted elements from out data like symbols and numbers
tk = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = tk.tokenize)
text_counts= cv.fit_transform(data['text'])
text_counts

<7613x21295 sparse matrix of type '<class 'numpy.int64'>'
	with 80841 stored elements in Compressed Sparse Row format>

In [17]:
from sklearn.model_selection import train_test_split

In [20]:
x_train,x_test,y_train, y_test=train_test_split(text_counts,data.target, test_size=0.2)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [22]:
lg=LogisticRegression()

In [23]:
lg.fit(x_train,y_train)

LogisticRegression()

In [24]:
y_pred=lg.predict(x_test)

In [25]:
accuracy_score(y_test,y_pred)

0.7977675640183848

In [26]:
recall_score(y_test,y_pred)

0.7004608294930875

In [27]:
f1_score(y_test,y_pred)

0.7475409836065573

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
nb=MultinomialNB()

In [19]:
nb.fit(x_train,y_train)

MultinomialNB()

In [20]:
ypred=nb.predict(x_test)
ypred

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [21]:
accuracy_score(y_test,ypred)

0.7905449770190414

In [22]:
recall_score(y_test,ypred)

0.7740916271721959

### Random forest, Adaboost, Gradient Boost and extra tree classifier

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
tfd=TfidfVectorizer()
tfdata=tfd.fit_transform(data['text'])

In [91]:
x_train,x_test,y_train, y_test=train_test_split(tfdata,data.target, test_size=0.1)

In [92]:
nb=MultinomialNB()
nb.fit(x_train,y_train)

MultinomialNB()

In [93]:
npred=nb.predict(x_test)


In [94]:
accuracy_score(y_test,npred)

0.8162729658792651

In [95]:
recall_score(y_test,npred)

0.6572327044025157

In [96]:
precision_score

<function sklearn.metrics._classification.precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')>

In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [98]:
rf=RandomForestClassifier(random_state=2)

In [99]:
rf.fit(x_train,y_train)


RandomForestClassifier(random_state=2)

In [100]:
rpred=rf.predict(x_test)

In [101]:
recall_score(y_test,rpred)

0.6132075471698113

#### Gridsearch Poor results

In [102]:
max_depth=[4, 5, 6]
n_estimators = [100, 200, 300]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

# Build the grid search
dfrst = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,random_state=42)
grid = GridSearchCV(estimator=dfrst, param_grid=param_grid, cv = 5)
grid_results = grid.fit(x_train, y_train)


In [103]:
grid.best_estimator_

RandomForestClassifier(max_depth=6, random_state=42)

In [104]:
grf=RandomForestClassifier(max_depth=16, n_estimators=64, random_state=2)
grf.fit(x_train,y_train)


RandomForestClassifier(max_depth=16, n_estimators=64, random_state=2)

In [105]:
grpred=grf.predict(x_test)

In [106]:
recall_score(y_test,grpred)

0.29874213836477986

### Gridsearch 2

In [63]:
rfc=RandomForestClassifier(random_state=42)

In [64]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [65]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [66]:
CV_rfc.best_params_


{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 500}

In [68]:
CV_rfc.best_estimator_

RandomForestClassifier(max_depth=8, n_estimators=500, random_state=42)

In [75]:
rfc1=RandomForestClassifier(max_depth=4, n_estimators=100, random_state=2, criterion='gini')

In [76]:
rfc1.fit(x_train, y_train)

RandomForestClassifier(max_depth=4, random_state=2)

In [77]:
rpr=rfc1.predict(x_test)
recall_score(y_test,rpr)

0.040419161676646706

## DecisionTreeClassifier

In [90]:
from sklearn.tree import DecisionTreeClassifier

In [108]:
param_dist = {"max_depth": [2,3,4,5, 6], "max_features": [1,3,4,5,7,8,9], "min_samples_leaf": [1,3,4,5,6,7],"criterion": ["gini", "entropy"]}

In [110]:
dt = DecisionTreeClassifier()
tree_cv = GridSearchCV(dt, param_dist, cv = 10)
tree_cv.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6],
                         'max_features': [1, 3, 4, 5, 7, 8, 9],
                         'min_samples_leaf': [1, 3, 4, 5, 6, 7]})

In [113]:
tpred=tree_cv.predict(x_test)
recall_score(y_test,tpred)

0.018867924528301886

In [111]:
tree_cv.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=6, max_features=9,
                       min_samples_leaf=6)

In [117]:
dt=DecisionTreeClassifier()
dt.fit(x_train, y_train)

DecisionTreeClassifier()

In [118]:
tpred=dt.predict(x_test)
recall_score(y_test,tpred)

0.6446540880503144

## Use GridSearch to check  the perfomance of AdaBoost, Gradient Boost and XGboost before and after hyperameter tuning