# AdaBoost Modeling

In [110]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import export_text, DecisionTreeClassifier, plot_tree
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

#For streamlit
import pickle

In [2]:
#import the dataset
friends = pd.read_csv('../../Datasets/friends-modeling.csv')
friends.head()

Unnamed: 0,season,episode,character,dialogue
0,s01,e01,Monica Geller,There's nothing to tell! He's just some guy I ...
1,s01,e01,Joey Tribbiani,"C'mon, you're going out with the guy! There's ..."
2,s01,e01,Chandler Bing,"All right Joey, be nice. So does he have a hum..."
3,s01,e01,Phoebe Buffay,"Wait, does he eat chalk?"
4,s01,e01,Phoebe Buffay,"Just, 'cause, I don't want her to go through w..."


In [3]:
#Make sure no nulls
friends.isnull().sum()

season       0
episode      0
character    0
dialogue     0
dtype: int64

### Make X and y values 

In [4]:
X = friends['dialogue']
y = friends['character']

### Split into Train and Test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

### Baseline Accuracy 

In [10]:
y_train.value_counts(normalize=True)

Rachel Green      0.177657
Ross Geller       0.177152
Chandler Bing     0.169355
Monica Geller     0.167242
Joey Tribbiani    0.160725
Phoebe Buffay     0.147869
Name: character, dtype: float64

### Instaniating Count Vectorizer, Fit and Transform 

In [11]:
cv = CountVectorizer()

In [12]:
cv.fit(X_train)

CountVectorizer()

In [13]:
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

### Modeling with 150 estimators

In [45]:
abc_150 = AdaBoostClassifier(random_state=42, n_estimators=150)
abc_150.fit(X_train_cv, y_train)

AdaBoostClassifier(n_estimators=150, random_state=42)

In [96]:
print(f'Train score: {abc_150.score(X_train_cv, y_train)}\nTest score: {abc_150.score(X_test_cv, y_test)}')

Train score: 0.2887453874538745
Test score: 0.28182467416532764


---
**Making Predictions**

In [47]:
preds_150 = abc_150.predict(X_test_cv)

---
**Making a dataframe with actual, predictions, and dialogue**

In [49]:
df_params_150 = pd.DataFrame(y_test)
df_params_150['predictions'] = preds_150
df_params_150['dialogue'] = X_test
df_params_150.rename(columns={'character': 'actual'}, inplace=True)
df_params_150.head(10)

Unnamed: 0,actual,predictions,dialogue
8260,Monica Geller,Chandler Bing,Then what's the problem?
12970,Phoebe Buffay,Rachel Green,"Yeah, well, everybody does! I'm a really cool ..."
9682,Rachel Green,Rachel Green,What? What? He's interested in you. He-he like...
22017,Monica Geller,Monica Geller,I've never loved anybody as much as I love you.
5611,Rachel Green,Chandler Bing,And I'm in it? Then let me read it.
22331,Joey Tribbiani,Joey Tribbiani,"Yeah, I gotta go! I got an acting job. Like yo..."
18609,Monica Geller,Chandler Bing,Great. So the ball is in his court?
23737,Monica Geller,Monica Geller,"Dad, please don't pick your teeth out here! Al..."
35446,Ross Geller,Monica Geller,"Excellent! Excellent, now-now do you want anot..."
3756,Monica Geller,Monica Geller,How are you?


In [53]:
df_params_150['predictions'].value_counts()

Chandler Bing     3411
Rachel Green      2227
Monica Geller     1878
Joey Tribbiani    1542
Ross Geller       1299
Phoebe Buffay      845
Name: predictions, dtype: int64

In [52]:
df_params_150['actual'].value_counts()

Rachel Green      1991
Ross Geller       1985
Chandler Bing     1897
Monica Geller     1873
Joey Tribbiani    1800
Phoebe Buffay     1656
Name: actual, dtype: int64

In [50]:
df_params_150.loc[df_params_150['actual']!= df_params_150['predictions']].count()

actual         8045
predictions    8045
dialogue       8045
dtype: int64

In [51]:
df_params_150.loc[df_params_d['actual']== df_params_150['predictions']].count()

actual         3157
predictions    3157
dialogue       3157
dtype: int64

---
**Predicting Some Phrases**

In [75]:
#abc_150.predict(["How you doin'?"])

In [76]:
#abc_150.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

In [74]:
#abc_150.predict(['We were on a break!'])[0]


<br>

---
### Modeling with 250 Estimators 

In [32]:
abc_250 = AdaBoostClassifier(random_state=42, n_estimators=250)
abc_250.fit(X_train_cv, y_train)

AdaBoostClassifier(n_estimators=250, random_state=42)

In [93]:
print(f'Train score: {abc_250.score(X_train_cv, y_train)}\nTest score: {abc_250.score(X_test_cv, y_test)}')

Train score: 0.30052969884537556
Test score: 0.28512765577575433


---
**Making Predictions**

In [34]:
preds_250 = abc_250.predict(X_test_cv)

---
**Creating a dataframe with actual, predictions, and dialogue**

In [35]:
df_params_250 = pd.DataFrame(y_test)
df_params_250['predictions'] = preds_250
df_params_250['dialogue'] = X_test
df_params_250.rename(columns={'character': 'actual'}, inplace=True)
df_params_250.head(10)

Unnamed: 0,actual,predictions,dialogue
8260,Monica Geller,Chandler Bing,Then what's the problem?
12970,Phoebe Buffay,Joey Tribbiani,"Yeah, well, everybody does! I'm a really cool ..."
9682,Rachel Green,Rachel Green,What? What? He's interested in you. He-he like...
22017,Monica Geller,Monica Geller,I've never loved anybody as much as I love you.
5611,Rachel Green,Chandler Bing,And I'm in it? Then let me read it.
22331,Joey Tribbiani,Joey Tribbiani,"Yeah, I gotta go! I got an acting job. Like yo..."
18609,Monica Geller,Chandler Bing,Great. So the ball is in his court?
23737,Monica Geller,Monica Geller,"Dad, please don't pick your teeth out here! Al..."
35446,Ross Geller,Monica Geller,"Excellent! Excellent, now-now do you want anot..."
3756,Monica Geller,Monica Geller,How are you?


In [39]:
df_params_250['predictions'].value_counts()

Chandler Bing     3227
Rachel Green      2193
Monica Geller     1951
Joey Tribbiani    1574
Ross Geller       1306
Phoebe Buffay      951
Name: predictions, dtype: int64

In [38]:
df_params_250['actual'].value_counts()

Rachel Green      1991
Ross Geller       1985
Chandler Bing     1897
Monica Geller     1873
Joey Tribbiani    1800
Phoebe Buffay     1656
Name: actual, dtype: int64

In [36]:
df_params_250.loc[df_params_250['actual']!= df_params_250['predictions']].count()

actual         8008
predictions    8008
dialogue       8008
dtype: int64

In [37]:
df_params_250.loc[df_params_250['actual']== df_params_250['predictions']].count()

actual         3194
predictions    3194
dialogue       3194
dtype: int64

---
**Predicting Some Phrases**

In [77]:
#abc_250.predict(["How you doin'?"])[0]

In [78]:
#abc_250.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

In [79]:
#abc_250.predict(['We were on a break!'])[0]


<br>

----
### Modeling with 500 Estimators

In [54]:
abc_500 = AdaBoostClassifier(random_state=42, n_estimators=150)
abc_500.fit(X_train_cv, y_train)

AdaBoostClassifier(n_estimators=150, random_state=42)

In [94]:
print(f'Train score: {abc_500.score(X_train_cv, y_train)}\nTest score: {abc_500.score(X_test_cv, y_test)}')

Train score: 0.2887453874538745
Test score: 0.28182467416532764


In [56]:
preds_500 = abc_500.predict(X_test_cv)

In [57]:
df_params_500 = pd.DataFrame(y_test)
df_params_500['predictions'] = preds_500
df_params_500['dialogue'] = X_test
df_params_500.rename(columns={'character': 'actual'}, inplace=True)
df_params_500.head(10)

Unnamed: 0,actual,predictions,dialogue
8260,Monica Geller,Chandler Bing,Then what's the problem?
12970,Phoebe Buffay,Rachel Green,"Yeah, well, everybody does! I'm a really cool ..."
9682,Rachel Green,Rachel Green,What? What? He's interested in you. He-he like...
22017,Monica Geller,Monica Geller,I've never loved anybody as much as I love you.
5611,Rachel Green,Chandler Bing,And I'm in it? Then let me read it.
22331,Joey Tribbiani,Joey Tribbiani,"Yeah, I gotta go! I got an acting job. Like yo..."
18609,Monica Geller,Chandler Bing,Great. So the ball is in his court?
23737,Monica Geller,Monica Geller,"Dad, please don't pick your teeth out here! Al..."
35446,Ross Geller,Monica Geller,"Excellent! Excellent, now-now do you want anot..."
3756,Monica Geller,Monica Geller,How are you?


In [58]:
df_params_500.loc[df_params_150['actual']!= df_params_500['predictions']].count()

actual         8045
predictions    8045
dialogue       8045
dtype: int64

In [59]:
df_params_500.loc[df_params_d['actual']== df_params_500['predictions']].count()

actual         3157
predictions    3157
dialogue       3157
dtype: int64

In [60]:
df_params_500['actual'].value_counts()

Rachel Green      1991
Ross Geller       1985
Chandler Bing     1897
Monica Geller     1873
Joey Tribbiani    1800
Phoebe Buffay     1656
Name: actual, dtype: int64

In [61]:
df_params_500['predictions'].value_counts()

Chandler Bing     3411
Rachel Green      2227
Monica Geller     1878
Joey Tribbiani    1542
Ross Geller       1299
Phoebe Buffay      845
Name: predictions, dtype: int64

---
**Predicting Some Phrases**

In [82]:
#abc_500.predict(["How you doin'?"])[0]

In [81]:
#abc_500.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

In [80]:
#abc_500.predict(['We were on a break!'])[0]


<br>

---
### Using Pipeline and GridSearch to find the Best Parameters

#### Pipeline for all GridSearch

In [62]:
pipe = Pipeline(steps= [('cv_g', CountVectorizer()),
                       ('abc_g', AdaBoostClassifier(random_state=42))])

#### GridSearch with Different Parameters

In [63]:
grid_1 = {'cv_g__stop_words': [None, 'english'],
         'cv_g__min_df': [0, 1, 2],
          'abc_g__n_estimators': [50, 100, 150, 175, 200, 250, 275]
         }

In [64]:
gs_1 = GridSearchCV(estimator= pipe, param_grid= grid_1)
gs_1.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv_g', CountVectorizer()),
                                       ('abc_g',
                                        AdaBoostClassifier(random_state=42))]),
             param_grid={'abc_g__n_estimators': [50, 100, 150, 175, 200, 250,
                                                 275],
                         'cv_g__min_df': [0, 1, 2],
                         'cv_g__stop_words': [None, 'english']})

In [65]:
gs_1.best_params_

{'abc_g__n_estimators': 275, 'cv_g__min_df': 2, 'cv_g__stop_words': None}

In [95]:
print(f'Train score: {gs_1.score(X_train, y_train)}\nTest score: {gs_1.score(X_test, y_test)}')

Train score: 0.3031781930722533
Test score: 0.2855740046420282


---
**Making Predictions**

In [83]:
preds_1 = gs_1.predict(X_test)

---
**Creating dataframe with actual, predictions, and dialogue**

In [85]:
gs_df_1 = pd.DataFrame(y_test)
gs_df_1['predictions'] = preds_1
gs_df_1['dialogue'] = X_test
gs_df_1.rename(columns={'character': 'actual'}, inplace=True)
gs_df_1.head(10)

Unnamed: 0,actual,predictions,dialogue
8260,Monica Geller,Chandler Bing,Then what's the problem?
12970,Phoebe Buffay,Chandler Bing,"Yeah, well, everybody does! I'm a really cool ..."
9682,Rachel Green,Rachel Green,What? What? He's interested in you. He-he like...
22017,Monica Geller,Monica Geller,I've never loved anybody as much as I love you.
5611,Rachel Green,Chandler Bing,And I'm in it? Then let me read it.
22331,Joey Tribbiani,Joey Tribbiani,"Yeah, I gotta go! I got an acting job. Like yo..."
18609,Monica Geller,Chandler Bing,Great. So the ball is in his court?
23737,Monica Geller,Monica Geller,"Dad, please don't pick your teeth out here! Al..."
35446,Ross Geller,Monica Geller,"Excellent! Excellent, now-now do you want anot..."
3756,Monica Geller,Monica Geller,How are you?


In [86]:
gs_df_1['predictions'].value_counts()

Chandler Bing     3096
Rachel Green      2200
Monica Geller     2032
Joey Tribbiani    1577
Ross Geller       1345
Phoebe Buffay      952
Name: predictions, dtype: int64

In [87]:
gs_df_1['actual'].value_counts()

Rachel Green      1991
Ross Geller       1985
Chandler Bing     1897
Monica Geller     1873
Joey Tribbiani    1800
Phoebe Buffay     1656
Name: actual, dtype: int64

In [88]:
gs_df_1.loc[gs_df_1['actual']!= gs_df_1['predictions']].count()

actual         8003
predictions    8003
dialogue       8003
dtype: int64

In [89]:
gs_df_1.loc[gs_df_1['actual']== gs_df_1['predictions']].count()

actual         3199
predictions    3199
dialogue       3199
dtype: int64

---
**Predicting Some Phrases**

In [90]:
gs_1.predict(["How you doin'?"])[0]

'Joey Tribbiani'

In [91]:
gs_1.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

'Monica Geller'

In [92]:
gs_1.predict(['We were on a break!'])[0]

'Chandler Bing'

<br>

---
### More Parameters

In [97]:
grid_2 = {'cv_g__stop_words': [None, 'english'],
         'cv_g__min_df': [2, 3, 4, 5],
          'abc_g__n_estimators': [275, 300, 325, 350, 375, 400]
         }

In [98]:
gs_2 = GridSearchCV(estimator= pipe, param_grid= grid_2)
gs_2.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv_g', CountVectorizer()),
                                       ('abc_g',
                                        AdaBoostClassifier(random_state=42))]),
             param_grid={'abc_g__n_estimators': [275, 300, 325, 350, 375, 400],
                         'cv_g__min_df': [2, 3, 4, 5],
                         'cv_g__stop_words': [None, 'english']})

In [99]:
gs_2.best_params_

{'abc_g__n_estimators': 375, 'cv_g__min_df': 2, 'cv_g__stop_words': None}

In [100]:
print(f'Train score: {gs_2.score(X_train, y_train)}\nTest score: {gs_2.score(X_test, y_test)}')

Train score: 0.3119866682537793
Test score: 0.2870915907873594


---
**Making Predictions**

In [101]:
preds_2 = gs_2.predict(X_test)

---
**Creating dataframe with actual, predictions, and dialogue**

In [102]:
gs_df_2 = pd.DataFrame(y_test)
gs_df_2['predictions'] = preds_2
gs_df_2['dialogue'] = X_test
gs_df_2.rename(columns={'character': 'actual'}, inplace=True)
gs_df_2.head(10)

Unnamed: 0,actual,predictions,dialogue
8260,Monica Geller,Chandler Bing,Then what's the problem?
12970,Phoebe Buffay,Rachel Green,"Yeah, well, everybody does! I'm a really cool ..."
9682,Rachel Green,Rachel Green,What? What? He's interested in you. He-he like...
22017,Monica Geller,Monica Geller,I've never loved anybody as much as I love you.
5611,Rachel Green,Chandler Bing,And I'm in it? Then let me read it.
22331,Joey Tribbiani,Joey Tribbiani,"Yeah, I gotta go! I got an acting job. Like yo..."
18609,Monica Geller,Chandler Bing,Great. So the ball is in his court?
23737,Monica Geller,Monica Geller,"Dad, please don't pick your teeth out here! Al..."
35446,Ross Geller,Monica Geller,"Excellent! Excellent, now-now do you want anot..."
3756,Monica Geller,Monica Geller,How are you?


In [103]:
gs_df_2['predictions'].value_counts()

Chandler Bing     2931
Rachel Green      2205
Monica Geller     2034
Joey Tribbiani    1596
Ross Geller       1435
Phoebe Buffay     1001
Name: predictions, dtype: int64

In [104]:
gs_df_2['actual'].value_counts()

Rachel Green      1991
Ross Geller       1985
Chandler Bing     1897
Monica Geller     1873
Joey Tribbiani    1800
Phoebe Buffay     1656
Name: actual, dtype: int64

In [105]:
gs_df_2.loc[gs_df_1['actual']!= gs_df_2['predictions']].count()

actual         7986
predictions    7986
dialogue       7986
dtype: int64

In [106]:
gs_df_2.loc[gs_df_1['actual']== gs_df_2['predictions']].count()

actual         3216
predictions    3216
dialogue       3216
dtype: int64

---
**Predicting Some Phrases**

In [107]:
gs_2.predict(["How you doin'?"])[0]

'Joey Tribbiani'

In [108]:
gs_2.predict(['Smelly cat, smelly cat, what are they feeding you'])[0]

'Rachel Green'

In [111]:
gs_2.predict(['We were on a break!'])[0]

'Chandler Bing'

<br>

---- 
### Exporting the Model Using Pickle

In [112]:
#Using the best model which was the 
with open('../ada-boost.pkl', mode='wb') as pickle_out:
    pickle.dump(gs_2, pickle_out)