In [1]:
# DataFrame
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from gensim.models import Word2Vec, Doc2Vec
import numpy as np
import pickle

In [2]:
# DATASET
DATASET_ENCODING = "UTF-8"

TRAIN_FILE_NAME = "train_data_prepared.csv"
TEST_FILE_NAME = "test_data_prepared.csv"

In [3]:
df = pd.read_csv(TRAIN_FILE_NAME, encoding=DATASET_ENCODING)

print(df.info())
print(df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 10 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   sentiment         1600000 non-null  int64 
 1   ids               1600000 non-null  int64 
 2   date              1600000 non-null  object
 3   flag              1600000 non-null  object
 4   user              1600000 non-null  object
 5   text              1600000 non-null  object
 6   mapped_sentiment  1600000 non-null  object
 7   tokenized_text    1600000 non-null  object
 8   stemmed_text      1592325 non-null  object
 9   lemmatized_text   1592325 non-null  object
dtypes: int64(2), object(8)
memory usage: 122.1+ MB
None
   sentiment         ids                       date      flag  \
0          0  1467810369  2009-04-06 23:12:45-07:00  NO_QUERY   
1          0  1467810672  2009-04-06 23:12:49-07:00  NO_QUERY   
2          0  1467810917  2009-04-06 23:12:53-07:00  NO_QUE

In [5]:
df_test = pd.read_csv(TEST_FILE_NAME, encoding=DATASET_ENCODING)

print(df_test.info())
print(df_test.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sentiment         498 non-null    int64 
 1   ids               498 non-null    int64 
 2   date              498 non-null    object
 3   flag              498 non-null    object
 4   user              498 non-null    object
 5   text              498 non-null    object
 6   mapped_sentiment  498 non-null    object
 7   tokenized_text    498 non-null    object
 8   stemmed_text      498 non-null    object
 9   lemmatized_text   498 non-null    object
dtypes: int64(2), object(8)
memory usage: 39.0+ KB
None
   sentiment  ids                       date     flag      user  \
0          4    3  2009-05-11 03:17:40+00:00  kindle2    tpryan   
1          4    4  2009-05-11 03:18:03+00:00  kindle2    vcu451   
2          4    5  2009-05-11 03:18:54+00:00  kindle2    chadfu   
3          4    6  

In [6]:
df.dropna(subset=['lemmatized_text'], how = 'all', inplace = True)
df_test.dropna(subset=['lemmatized_text'], how = 'all', inplace = True)

In [7]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(df.lemmatized_text)
X_test = vectorizer.transform(df_test.lemmatized_text)
y_train = df.mapped_sentiment
y_test = df_test.mapped_sentiment

In [27]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [30]:
with open('model.pkl','wb') as f:
    pickle.dump(clf,f)


In [41]:
y_pred = clf.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5301204819277109

Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.57      0.72      0.64       177
     NEUTRAL       0.00      0.00      0.00       139
    POSITIVE       0.50      0.75      0.60       182

    accuracy                           0.53       498
   macro avg       0.36      0.49      0.41       498
weighted avg       0.38      0.53      0.44       498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
X_test_without_neutral = vectorizer.transform(df_test[df_test['mapped_sentiment'] != 'NEUTRAL'].lemmatized_text)
y_test_without_neutral = df_test[df_test['mapped_sentiment'] != 'NEUTRAL'].mapped_sentiment

In [43]:
y_pred_without_neutral = clf.predict(X_test_without_neutral)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test_without_neutral, y_pred_without_neutral))
print("\nClassification Report:\n", classification_report(y_test_without_neutral, y_pred_without_neutral))

Accuracy: 0.7353760445682451

Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.74      0.72      0.73       177
    POSITIVE       0.73      0.75      0.74       182

    accuracy                           0.74       359
   macro avg       0.74      0.74      0.74       359
weighted avg       0.74      0.74      0.74       359



In [8]:
df.dropna(subset=['stemmed_text'], how = 'all', inplace = True)
df_test.dropna(subset=['stemmed_text'], how = 'all', inplace = True)

In [12]:
vectorizer_stemmed = TfidfVectorizer(stop_words='english')
X_train_stemmed = vectorizer_stemmed.fit_transform(df.stemmed_text)
X_test_stemmed = vectorizer_stemmed.transform(df_test.stemmed_text)
y_train_stemmed = df.mapped_sentiment
y_test_stemmed = df_test.mapped_sentiment

In [13]:
clf_stemmed = DecisionTreeClassifier(random_state=42)
clf_stemmed.fit(X_train_stemmed, y_train_stemmed)

In [14]:
with open('model_stemmed.pkl','wb') as f:
    pickle.dump(clf_stemmed,f)

In [16]:
y_pred_stemmed = clf_stemmed.predict(X_test_stemmed)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test_stemmed, y_pred_stemmed))
print("\nClassification Report:\n", classification_report(y_test_stemmed, y_pred_stemmed))

Accuracy: 0.5321285140562249

Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.56      0.71      0.63       177
     NEUTRAL       0.00      0.00      0.00       139
    POSITIVE       0.51      0.77      0.61       182

    accuracy                           0.53       498
   macro avg       0.36      0.49      0.41       498
weighted avg       0.39      0.53      0.45       498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
X_test_stemmed_without_neutral = vectorizer.transform(df_test[df_test['mapped_sentiment'] != 'NEUTRAL'].lemmatized_text)
y_test_stemmed_without_neutral = df_test[df_test['mapped_sentiment'] != 'NEUTRAL'].mapped_sentiment

In [19]:
y_pred_without_neutral = clf_stemmed.predict(X_test_stemmed_without_neutral)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test_stemmed_without_neutral, y_pred_without_neutral))
print("\nClassification Report:\n", classification_report(y_test_stemmed_without_neutral, y_pred_without_neutral))

Accuracy: 0.6908077994428969

Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.69      0.67      0.68       177
    POSITIVE       0.69      0.71      0.70       182

    accuracy                           0.69       359
   macro avg       0.69      0.69      0.69       359
weighted avg       0.69      0.69      0.69       359



In [13]:
vector_size = 500
w2v_model = Word2Vec([_text.split(" ") for _text in df.lemmatized_text], min_count = 1, vector_size = vector_size, workers = 4, sg = 1)

In [47]:
w2v_model.wv['a']

KeyError: "Key 'a' not present"

In [15]:
word2vec_filename = 'train_word2vec.csv'
y_train_w2v = []
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in df.iterrows():
        tokens = [token for token in row['lemmatized_text'] if token in w2v_model.wv.key_to_index]
        if len(tokens) > 0:
            y_train_w2v.append(row['mapped_sentiment'])
            model_vector = (np.mean([w2v_model.wv[token] for token in tokens], axis=0)).tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(vector_size))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            if type(model_vector) is list:
                line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
            else:
                line1 = ",".join([str(0) for i in range(vector_size)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')

In [16]:
# Load from the filename
word2vec_df = pd.read_csv(word2vec_filename)
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

# Fit the model
clf_decision_word2vec.fit(word2vec_df,y_train_w2v)

In [18]:
test_features_word2vec = []
y_test_w2v = []
for index, row in df_test[df_test['mapped_sentiment'] != 'NEUTRAL'].iterrows():
    tokens = [token for token in row['lemmatized_text'] if token in w2v_model.wv.key_to_index]
    if len(tokens) > 0:
        y_test_w2v.append(row['mapped_sentiment'])
        model_vector = (np.mean([w2v_model.wv[token] for token in tokens], axis=0)).tolist()
        if type(model_vector) is list:
            test_features_word2vec.append(model_vector)
        else:
            test_features_word2vec.append(np.array([0 for i in range(vector_size)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(classification_report(y_test_w2v,test_predictions_word2vec))

              precision    recall  f1-score   support

    NEGATIVE       0.50      0.51      0.51       177
    POSITIVE       0.52      0.51      0.51       182

    accuracy                           0.51       359
   macro avg       0.51      0.51      0.51       359
weighted avg       0.51      0.51      0.51       359



