# Modules Import

In [212]:
%%time
import pandas as pd
import numpy as np

import sys
import subprocess
import re

import nltk
import spacy
import random
import torch

from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from autocorrect import Speller
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

CPU times: user 28 µs, sys: 7 µs, total: 35 µs
Wall time: 37 µs


# Preparing the Environment and Resources

In [213]:
%%time
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
    
stemmer: PorterStemmer = PorterStemmer()
spell: Speller = Speller(lang="en")
tfidf_vect: TfidfVectorizer = TfidfVectorizer()

nltk.download("stopwords")
stop_words: set[str] = set(stopwords.words("english"))


try:
    subprocess.check_output(["python3", "-m", "spacy", "download", "en_core_web_sm"])
except Exception as e:
    subprocess.check_output(["python3", "-m", "download", "en_core_web_sm"])

nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gerrenme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 264 ms, sys: 23.1 ms, total: 287 ms
Wall time: 15.2 s


# Data Preprocessing

In [214]:
%%time
def df_preprocessing(filepath: str) -> pd.DataFrame:
    new_df: pd.DataFrame = pd.read_csv(filepath)
    new_df = new_df.T

    new_df.reset_index(inplace=True)
    new_df.columns = ["tweets"]
    new_df["tweets"] = new_df["tweets"].apply(lambda words: words.lower())

    return new_df


negative_df: pd.DataFrame = df_preprocessing("datasets/processedNegative.csv")
positive_df: pd.DataFrame = df_preprocessing("datasets/processedPositive.csv")
neutral_df: pd.DataFrame = df_preprocessing("datasets/processedNeutral.csv")

negative_df["mood"] = "negative"
positive_df["mood"] = "positive"
neutral_df["mood"] = "neutral"

tweets_df: pd.DataFrame = pd.concat([neutral_df, positive_df, negative_df])

display(negative_df.head(3))

Unnamed: 0,tweets,mood
0,how unhappy some dogs like it though,negative
1,talking to my over driver about where i'm goin...,negative
2,does anybody know if the rand's likely to fall...,negative


CPU times: user 130 ms, sys: 669 µs, total: 131 ms
Wall time: 130 ms


In [215]:
%%time
def convert_label_to_numeric(label):
    if label == "positive":
        return 1
    elif label == "negative":
        return -1
    else:
        return 0


tweets_df = tweets_df.sample(frac=1).reset_index(drop=True)
tweets_df["mood"] = tweets_df["mood"].apply(convert_label_to_numeric)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood
0,can you do that when player kills you. then th...,1
1,kiera's quest: awakenings (kiera's quest book 1),1
2,i really enjoyed horrizion zero dawn but no id...,-1


CPU times: user 3.66 ms, sys: 0 ns, total: 3.66 ms
Wall time: 3.25 ms


In [216]:
%%time
def remove_punctuation(text: str) -> str:
    return re.sub(r"[^a-zA-Z\s]+", "", text)


tweets_df["tweets"] = tweets_df["tweets"].apply(remove_punctuation)
display(tweets_df.head(3))

Unnamed: 0,tweets,mood
0,can you do that when player kills you then the...,1
1,kieras quest awakenings kieras quest book,1
2,i really enjoyed horrizion zero dawn but no id...,-1


CPU times: user 8.36 ms, sys: 0 ns, total: 8.36 ms
Wall time: 8.05 ms


In [217]:
def drop_stop_words(df: pd.DataFrame) -> pd.DataFrame:
    df["tweets"] = df["tweets"].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
    return df

tweets_df = drop_stop_words(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood
0,player kills player rank would show happy,1
1,kieras quest awakenings kieras quest book,1
2,really enjoyed horrizion zero dawn idea play n...,-1


In [218]:
%%time
def add_tokens(df: pd.DataFrame) -> pd.DataFrame:
    df["tokens"] = df["tweets"].apply(word_tokenize)
    return df


tweets_df = add_tokens(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]"
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]"
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,..."


CPU times: user 127 ms, sys: 0 ns, total: 127 ms
Wall time: 126 ms


In [219]:
%%time
def add_stemming(df: pd.DataFrame) -> pd.DataFrame:
    if "tokens" not in df.columns:
        df = add_tokens(df)
        
    df["stemmed_text"] = df["tokens"].apply(lambda words: [stemmer.stem(word) for word in words])
    df["stemmed_text"] = df["stemmed_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_stemming(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...


CPU times: user 137 ms, sys: 78 µs, total: 137 ms
Wall time: 137 ms


In [220]:
%%time
def add_lemma(df: pd.DataFrame) -> pd.DataFrame:
    if "tokens" not in df.columns:
        df = add_tokens(df)

    df["lemmatized_text"] = df["tokens"].apply(lambda words: [token.lemma_ for token in nlp(" ".join(words))])
    df["lemmatized_text"] = df["lemmatized_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_lemma(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi,player kill player rank would show happy
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book,kieras quest awakening kieras quest book
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...,really enjoy horrizion zero dawn idea play nex...


CPU times: user 8.2 s, sys: 8.45 ms, total: 8.21 s
Wall time: 8.21 s


In [221]:
%%time
def add_misspellings(df: pd.DataFrame) -> pd.DataFrame:
    if "tokens" not in df.columns:
        df = add_tokens(df)
        
    df["misspell_text"] = df["tokens"].apply(lambda words: [spell(word) for word in words])
    df["misspell_text"] = df["misspell_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_misspellings(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi,player kill player rank would show happy,player kills player rank would show happy
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book,kieras quest awakening kieras quest book,skiers quest awakening skiers quest book
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...,really enjoy horrizion zero dawn idea play nex...,really enjoyed horizon zero dawn idea play nex...


CPU times: user 35.7 s, sys: 30.4 ms, total: 35.7 s
Wall time: 35.7 s


In [222]:
%%time
def add_lemma_misspellings(df: pd.DataFrame) -> pd.DataFrame:
    if "lemmatized_text" not in df.columns:
        df = add_lemma(df)
        
    df["lemma_misspell_text"] = df["lemmatized_text"].apply(lambda words: [spell(word) for word in words.split()])
    df["lemma_misspell_text"] = df["lemma_misspell_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_lemma_misspellings(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi,player kill player rank would show happy,player kills player rank would show happy,player kill player rank would show happy
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book,kieras quest awakening kieras quest book,skiers quest awakening skiers quest book,skiers quest awakening skiers quest book
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...,really enjoy horrizion zero dawn idea play nex...,really enjoyed horizon zero dawn idea play nex...,really enjoy horizon zero dawn idea play next ...


CPU times: user 35.5 s, sys: 27.7 ms, total: 35.5 s
Wall time: 35.5 s


# Creating a Bag Of Words and Adding New Attributes

In [223]:
%%time
def get_bag_of_words(df: pd.DataFrame) -> list[str]:
    bag: list[str] = []
    if "misspell_text" in df.columns:
        words: set[str] = set([token.strip() for tokens in df["misspell_text"].values for token in tokens.split()])
        bag = sorted(words)
    
    return bag


def add_binary_words(df: pd.DataFrame) -> pd.DataFrame:
    bag_of_words: list[str] = get_bag_of_words(df)
    for col in df.columns:
        if col in bag_of_words:
            bag_of_words.remove(col)
    
    for word in bag_of_words:
        df[word] = df["misspell_text"].apply(lambda row: 1 if word in row else 0)
        
    return df


binary_tweets_df: pd.DataFrame = tweets_df.copy()
binary_tweets_df = add_binary_words(binary_tweets_df)

display(binary_tweets_df.head(3))



Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,aa,aaaaaa,aaffaa,...,yoyoyou,yr,yrs,yuri,zabardast,zac,zealand,zero,zoo,zoos
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi,player kill player rank would show happy,player kills player rank would show happy,player kill player rank would show happy,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book,kieras quest awakening kieras quest book,skiers quest awakening skiers quest book,skiers quest awakening skiers quest book,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...,really enjoy horrizion zero dawn idea play nex...,really enjoyed horizon zero dawn idea play nex...,really enjoy horizon zero dawn idea play next ...,0,0,0,...,0,0,0,0,0,0,0,1,0,0


CPU times: user 6.42 s, sys: 13.4 ms, total: 6.43 s
Wall time: 6.41 s


In [224]:
%%time
def add_words_count(df: pd.DataFrame) -> pd.DataFrame: 
    bag_of_words: list[str] = get_bag_of_words(df)
    for col in df.columns:
        if col in bag_of_words:
            bag_of_words.remove(col)
        
    for word in bag_of_words:
        df[word] = df["misspell_text"].apply(lambda row: row.count(word))
        
    return df


count_tweets_df: pd.DataFrame = tweets_df.copy()
count_tweets_df = add_words_count(count_tweets_df)

display(count_tweets_df.head(3))



Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,aa,aaaaaa,aaffaa,...,yoyoyou,yr,yrs,yuri,zabardast,zac,zealand,zero,zoo,zoos
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi,player kill player rank would show happy,player kills player rank would show happy,player kill player rank would show happy,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book,kieras quest awakening kieras quest book,skiers quest awakening skiers quest book,skiers quest awakening skiers quest book,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...,really enjoy horrizion zero dawn idea play nex...,really enjoyed horizon zero dawn idea play nex...,really enjoy horizon zero dawn idea play next ...,0,0,0,...,0,0,0,0,0,0,0,1,0,0


CPU times: user 7.4 s, sys: 40 ms, total: 7.44 s
Wall time: 7.4 s


In [225]:
%%time
def add_tfidf(df: pd.DataFrame) -> pd.DataFrame:
    tfidf_matrix = tfidf_vect.fit_transform(df["misspell_text"])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                            columns=tfidf_vect.get_feature_names_out(), index=df.index)

    df["words_tfidf"] = tfidf_df.values.tolist()
    return df


tfidf_tweets_df: pd.DataFrame = tweets_df.copy()
tfidf_tweets_df = add_tfidf(tfidf_tweets_df)

display(tfidf_tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,words_tfidf
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi,player kill player rank would show happy,player kills player rank would show happy,player kill player rank would show happy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book,kieras quest awakening kieras quest book,skiers quest awakening skiers quest book,skiers quest awakening skiers quest book,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...,really enjoy horrizion zero dawn idea play nex...,really enjoyed horizon zero dawn idea play nex...,really enjoy horizon zero dawn idea play next ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


CPU times: user 291 ms, sys: 163 ms, total: 454 ms
Wall time: 452 ms


In [226]:
%%time
def add_word2vec(df: pd.DataFrame, vector_size: int = 100, min_count: int = 1, 
                 window: int = 5, epochs: int = 100) -> pd.DataFrame:
    
    sentences = [text.split() for text in df["misspell_text"]]
    model = Word2Vec(sentences, vector_size=vector_size, min_count=min_count, 
                     window=window, epochs=epochs)

    word2vec_vectors = []
    for text in sentences:
        if len(text) > 0:
            vector = sum(model.wv[word] for word in text if word in model.wv) / len(text)
        else:
            vector = [0] * vector_size 
        word2vec_vectors.append(vector)

    df["words_word2vec"] = word2vec_vectors
    return df


w2v_tweets_df: pd.DataFrame = tweets_df.copy()
w2v_tweets_df = add_word2vec(w2v_tweets_df)

display(w2v_tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,words_word2vec
0,player kills player rank would show happy,1,"[player, kills, player, rank, would, show, happy]",player kill player rank would show happi,player kill player rank would show happy,player kills player rank would show happy,player kill player rank would show happy,"[-0.5020763, 0.25726023, 0.11778996, -0.401104..."
1,kieras quest awakenings kieras quest book,1,"[kieras, quest, awakenings, kieras, quest, book]",kiera quest awaken kiera quest book,kieras quest awakening kieras quest book,skiers quest awakening skiers quest book,skiers quest awakening skiers quest book,"[0.008018449, 0.2715638, -0.019535685, 0.14251..."
2,really enjoyed horrizion zero dawn idea play n...,-1,"[really, enjoyed, horrizion, zero, dawn, idea,...",realli enjoy horrizion zero dawn idea play nex...,really enjoy horrizion zero dawn idea play nex...,really enjoyed horizon zero dawn idea play nex...,really enjoy horizon zero dawn idea play next ...,"[-0.41309798, 0.33599615, 0.31817555, -0.47787..."


CPU times: user 1.36 s, sys: 0 ns, total: 1.36 s
Wall time: 1.22 s


# Binary DF Prediction

In [227]:
%%time
X: pd.DataFrame = binary_tweets_df.iloc[:, 7:]
y: pd.Series = binary_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

pred_results: dict[str, float] = {}

CPU times: user 234 ms, sys: 0 ns, total: 234 ms
Wall time: 233 ms


In [228]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=100, max_depth=4)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_bin"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_bin"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_bin"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 343 ms, sys: 0 ns, total: 343 ms
Wall time: 341 ms


In [229]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_bin"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_bin"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_bin"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 21.7 s, sys: 21.9 s, total: 43.7 s
Wall time: 2.4 s


In [230]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.6941935483870968
Prediction type rand_for_mae_bin; Value -- 0.31870967741935485
Prediction type rand_for_mrse_bin; Value -- 0.5869549633764571
Prediction type log_reg_accur_bin; Value -- 0.8683870967741936
Prediction type log_reg_mae_bin; Value -- 0.1664516129032258
Prediction type log_reg_mrse_bin; Value -- 0.4859310982619496


# Count DF Prediction

In [231]:
%%time
X: pd.DataFrame = count_tweets_df.iloc[:, 7:]
y: pd.Series = count_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

CPU times: user 586 ms, sys: 1.33 s, total: 1.91 s
Wall time: 293 ms


In [232]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=50, max_depth=22)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_cnt"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_cnt"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_cnt"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 601 ms, sys: 31.6 ms, total: 632 ms
Wall time: 632 ms


In [233]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_cnt"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_cnt"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_cnt"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 37.5 s, sys: 38.2 s, total: 1min 15s
Wall time: 4.02 s


In [234]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.6941935483870968
Prediction type rand_for_mae_bin; Value -- 0.31870967741935485
Prediction type rand_for_mrse_bin; Value -- 0.5869549633764571
Prediction type log_reg_accur_bin; Value -- 0.8683870967741936
Prediction type log_reg_mae_bin; Value -- 0.1664516129032258
Prediction type log_reg_mrse_bin; Value -- 0.4859310982619496
Prediction type rand_for_accur_cnt; Value -- 0.8154838709677419
Prediction type rand_for_mae_cnt; Value -- 0.1896774193548387
Prediction type rand_for_mrse_cnt; Value -- 0.4472135954999579
Prediction type log_reg_accur_cnt; Value -- 0.8580645161290322
Prediction type log_reg_mae_cnt; Value -- 0.1703225806451613
Prediction type log_reg_mrse_cnt; Value -- 0.4765467177450165


# TFIDF DF Prediction

In [235]:
%%time
X: pd.DataFrame = tfidf_tweets_df["words_tfidf"].to_list()
y: pd.Series = tfidf_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

CPU times: user 195 ms, sys: 640 ms, total: 835 ms
Wall time: 44.1 ms


In [236]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=50, max_depth=22)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_tfidf"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_tfidf"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_tfidf"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 1.39 s, sys: 397 ms, total: 1.79 s
Wall time: 1.29 s


In [237]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_tfidf"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_tfidf"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_tfidf"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 10.2 s, sys: 12.4 s, total: 22.5 s
Wall time: 1.55 s


In [238]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.6941935483870968
Prediction type rand_for_mae_bin; Value -- 0.31870967741935485
Prediction type rand_for_mrse_bin; Value -- 0.5869549633764571
Prediction type log_reg_accur_bin; Value -- 0.8683870967741936
Prediction type log_reg_mae_bin; Value -- 0.1664516129032258
Prediction type log_reg_mrse_bin; Value -- 0.4859310982619496
Prediction type rand_for_accur_cnt; Value -- 0.8154838709677419
Prediction type rand_for_mae_cnt; Value -- 0.1896774193548387
Prediction type rand_for_mrse_cnt; Value -- 0.4472135954999579
Prediction type log_reg_accur_cnt; Value -- 0.8580645161290322
Prediction type log_reg_mae_cnt; Value -- 0.1703225806451613
Prediction type log_reg_mrse_cnt; Value -- 0.4765467177450165
Prediction type rand_for_accur_tfidf; Value -- 0.8348387096774194
Prediction type rand_for_mae_tfidf; Value -- 0.17161290322580644
Prediction type rand_for_mrse_tfidf; Value -- 0.42955340649593043
Prediction type log_reg_accur_tfidf; Value -- 0.88
P

# Word2Vec Prediction

In [239]:
%%time
X: pd.DataFrame = w2v_tweets_df["words_word2vec"].to_list()
y: pd.Series = w2v_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

CPU times: user 31.6 ms, sys: 9.31 ms, total: 41 ms
Wall time: 2.55 ms


In [240]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=50, max_depth=22)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_w2v"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_w2v"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_w2v"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 1.35 s, sys: 1.05 s, total: 2.41 s
Wall time: 971 ms


In [241]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_w2v"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_w2v"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_w2v"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 687 ms, sys: 1.3 s, total: 1.99 s
Wall time: 110 ms


In [242]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.6941935483870968
Prediction type rand_for_mae_bin; Value -- 0.31870967741935485
Prediction type rand_for_mrse_bin; Value -- 0.5869549633764571
Prediction type log_reg_accur_bin; Value -- 0.8683870967741936
Prediction type log_reg_mae_bin; Value -- 0.1664516129032258
Prediction type log_reg_mrse_bin; Value -- 0.4859310982619496
Prediction type rand_for_accur_cnt; Value -- 0.8154838709677419
Prediction type rand_for_mae_cnt; Value -- 0.1896774193548387
Prediction type rand_for_mrse_cnt; Value -- 0.4472135954999579
Prediction type log_reg_accur_cnt; Value -- 0.8580645161290322
Prediction type log_reg_mae_cnt; Value -- 0.1703225806451613
Prediction type log_reg_mrse_cnt; Value -- 0.4765467177450165
Prediction type rand_for_accur_tfidf; Value -- 0.8348387096774194
Prediction type rand_for_mae_tfidf; Value -- 0.17161290322580644
Prediction type rand_for_mrse_tfidf; Value -- 0.42955340649593043
Prediction type log_reg_accur_tfidf; Value -- 0.88
P

# Find Most Simmilar Twits

In [243]:
tfidf_matrix = tfidf_vect.fit_transform(tfidf_tweets_df["lemma_misspell_text"])
similarity_matrix: cosine_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

similar_documents_amount: int = 10 
for t, document in enumerate(tfidf_tweets_df["lemma_misspell_text"]):
    similar_indices = similarity_matrix[t].argsort()[: -num_similar_documents - 1: -1][1: ]
    print(f"Наиболее похожие записи на '{document}':")
    
    for j in similar_indices:
        print(tweets_df.iloc[j]["lemma_misspell_text"])
    
    print()
    if t == similar_documents_amount:
        break

Наиболее похожие записи на 'player kill player rank would show happy':
perfectly happy player we ve get
kill
kill
may show
number show
kill cry joy
break rank body east
activist kill
would great trick happy

Наиболее похожие записи на 'skiers quest awakening skiers quest book':
autograph book ready
cbi book leader case
record haul book
breathe highway book
be not say read book today
be not say read book today
enter win box highly anticipate book book box open int happy
iii would like book fuck unhappy
book car service award tomorrow

Наиболее похожие записи на 'really enjoy horizon zero dawn idea play next unhappy':
enjoy happy
enjoy smile
have not play fm long time really unhappy
idea say also
I m really happy
idea wrong
idea wrong
life enjoy happy
really really need right unhappy

Наиболее похожие записи на 'heartland lab':
we nsa hr arrive today hold talk nsa
boss india large caution waive
wish could attend unhappy
may go away happy
full expect month two around subramanian
mapdata r

* Выше представлен вариант, который учитывает дубликаты постов
* Ниже представлен вариант, который не учитывает дубликаты постов

In [244]:
tfidf_matrix = tfidf_vect.fit_transform(tfidf_tweets_df["lemma_misspell_text"].drop_duplicates(keep="first"))
similarity_matrix: cosine_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

similar_documents_amount: int = 10 
for t, document in enumerate(tfidf_tweets_df["lemma_misspell_text"]):
    similar_indices = similarity_matrix[t].argsort()[: -num_similar_documents - 1: -1][1: ]
    print(f"Наиболее похожие записи на '{document}':")
    
    for j in similar_indices:
        print(tweets_df.iloc[j]["lemma_misspell_text"])
    
    print()
    if t == similar_documents_amount:
        break

Наиболее похожие записи на 'player kill player rank would show happy':
day school leave happy
army block key highway civilian suffer
haha good tweet mate happy hear big boy account king shit happensould say happy chance shit happen
back matemticas unhappy
push team use twitter tool
fuck do not unhappy
definitely arm unhappy
look could kill smile
nothing could better happy

Наиболее похожие записи на 'skiers quest awakening skiers quest book':
mayawati play divideandwoo home turf election
koala die thirst we unhappy
kp naar engagement gulf deepen
calcutta menace bear eat pavement footpath
half empty cloud research
mean voice change unhappy
dragon omg
scientist save orbiter death eclipse
high court grant stay criminal proceeding minister case file mla mou mostra

Наиболее похожие записи на 'really enjoy horizon zero dawn idea play next unhappy':
totally
yogi pm
miss louis tweet unhappy
summon
just you
union include railway budget today
show desa accidental new book tirthankar roy anand v

# Totals

* В процессе выполнения задания был обработан датафрейм (убраны стоп-слова (точность предсказаний без стоп-слов немного выше, чем с ними), добавлены лемматизация, стемминг, миссемплинг, мешок слов, а также tfidf и word2vec). наилучший результат предсказаний - 0.89 через логистическую регрессию на основе word2vec