In [1]:
cd ..

/Users/sukyee/Desktop/team5


In [2]:
import os
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
from src.datapipeline.clean_data import clean_pipeline
from src.datapipeline.loader import Loader
from src.modelling import model_tfidf

[nltk_data] Downloading package punkt to /Users/sukyee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/sukyee/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sukyee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sukyee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sukyee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

### Loading Data

In [4]:
path = os.getcwd() + '/clean_data'

train_data = path + '/clean_train.csv'
test_data = path + '/clean_test.csv'
val_data = path + '/clean_val.csv'

In [5]:
df_train = pd.read_csv(train_data)
df_test = pd.read_csv(test_data)
df_val = pd.read_csv(val_data)

In [6]:
X_train = df_train.drop(columns=['sentiment'])
y_train = df_train['sentiment']
X_test = df_test.drop(columns=['sentiment'])
y_test = df_test['sentiment']
X_val = df_val.drop(columns=['sentiment'])
y_val = df_val['sentiment']

### Create TFIDF

In [8]:
tfidf= TfidfVectorizer(strip_accents='ascii', stop_words='english')

corpus = X_train['tweet'].values
vec = tfidf.fit_transform(corpus)

In [10]:
doc_1_df = pd.DataFrame( {'vocab': tfidf.get_feature_names(),'word_count': vec.toarray()[0] } ) 

In [24]:
doc_1_df = doc_1_df.sort_values(by='word_count', ascending = True)

In [25]:
doc_1_df.head(10)

Unnamed: 0,vocab,word_count
0,aa,0.0
4,aback,0.0
5,abacus,0.0
6,abandon,0.0
7,abandoned,0.0
8,abandonedly,0.0
9,abandonment,0.0
10,abate,0.0
11,abb,0.0
12,abbas,0.0


In [26]:
doc_1_df_filtered = doc_1_df[doc_1_df['word_count']!=0]

In [27]:
doc_1_df_filtered.head(10)

Unnamed: 0,vocab,word_count
3764,corona,0.071144
11287,new,0.258463
14445,right,0.264762
1531,believe,0.306153
2132,bring,0.311538
10668,mind,0.336828
2834,child,0.339729
15165,set,0.353323
15926,speak,0.359999
4137,cute,0.428889


### Decision Tree Regressor Model

In [28]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()

In [29]:
dtr.fit(vec, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [30]:
val_corpus = X_val['tweet'].values
# val_vec = cv.transform(val_corpus)
val_tfidt = tfidf.transform(val_corpus)


In [46]:
print(val_tfidt.shape)

(23581, 19473)


In [31]:
val_predict = dtr.predict(val_tfidt) 

In [32]:
mae_val = mae(y_val, val_predict).round(3)
mse_val = mse(y_val, val_predict).round(3)

print("Model: Decision Tree Regressor")
print("Mean Absolute Error: ", mae_val)
print("Mean Squared Error", mse_val)
print("Root Mean Squared Error", np.sqrt(mse_val).round(3))


Model: Decision Tree Regressor
Mean Absolute Error:  0.099
Mean Squared Error 0.038
Root Mean Squared Error 0.195


### KNN model

In [16]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()

In [17]:
knn.fit(tfidt, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [18]:
val_predict_knn = knn.predict(val_tfidt)

In [21]:
mae_val_knn = mae(y_val, val_predict_knn).round(3)
mse_val_knn = mse(y_val, val_predict_knn).round(3)
print("Model: K Nearest Neighbors")
print("Mean Absolute Error: ", mae_val_knn)
print("Mean Squared Error", mse_val_knn)
print("Root Mean Squared Error", np.sqrt(mse_val_knn).round(3))

Model: K Nearest Neighbors
Mean Absolute Error:  0.165
Mean Squared Error 0.054
Root Mean Squared Error 0.232


### Testing the Model

In [33]:
test_corpus = X_test['tweet'].values
# test_vec = cv.transform(test_corpus)
test_tfidt = tfidf.transform(test_corpus)

In [48]:
print(test_tfidt.shape)

(23231, 19473)


In [34]:
predict = dtr.predict(test_tfidt) 

In [35]:
mae_test = mae(y_test, predict).round(3)
mse_test = mse(y_test, predict).round(3)

print("Model: Decision Tree Regressor")
print("Mean Absolute Error: ", mae_test)
print("Mean Squared Error", mse_test)
print("Root Mean Squared Error", np.sqrt(mse_test).round(3))

Model: Decision Tree Regressor
Mean Absolute Error:  0.099
Mean Squared Error 0.038
Root Mean Squared Error 0.195


In [25]:
predict_knn = knn.predict(test_tfidt) 

In [26]:
mae_test_knn = mae(y_test, predict_knn).round(3)
mse_test_knn = mse(y_test, predict_knn).round(3)
print("Model: K Nearest Neighbors")
print("Mean Absolute Error: ", mae_test_knn)
print("Mean Squared Error", mse_test_knn)
print("Root Mean Squared Error", np.sqrt(mse_test_knn).round(3))

Model: K Nearest Neighbors
Mean Absolute Error:  0.164
Mean Squared Error 0.054
Root Mean Squared Error 0.232
