# Modules Import

In [13]:
%%time
import pandas as pd
import numpy as np

import sys
import subprocess
import re

import nltk
import spacy
import random
import torch
import warnings

from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from autocorrect import Speller
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

CPU times: user 63 µs, sys: 0 ns, total: 63 µs
Wall time: 66 µs


# Preparing the Environment and Resources

In [14]:
%%time
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
    
stemmer: PorterStemmer = PorterStemmer()
spell: Speller = Speller(lang="en")
tfidf_vect: TfidfVectorizer = TfidfVectorizer()

nltk.download("stopwords")
stop_words: set[str] = set(stopwords.words("english"))
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)


try:
    subprocess.check_output(["python3", "-m", "spacy", "download", "en_core_web_sm"])
except Exception as e:
    subprocess.check_output(["python3", "-m", "download", "en_core_web_sm"])

nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gerrenme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 262 ms, sys: 12.4 ms, total: 275 ms
Wall time: 14.6 s


# Data Preprocessing

In [3]:
%%time
def df_preprocessing(filepath: str) -> pd.DataFrame:
    """The method reads the dataframe and converts it to a readable form"""
    new_df: pd.DataFrame = pd.read_csv(filepath)
    new_df = new_df.T

    new_df.reset_index(inplace=True)
    new_df.columns = ["tweets"]
    new_df["tweets"] = new_df["tweets"].apply(lambda words: words.lower())

    return new_df


negative_df: pd.DataFrame = df_preprocessing("datasets/processedNegative.csv")
positive_df: pd.DataFrame = df_preprocessing("datasets/processedPositive.csv")
neutral_df: pd.DataFrame = df_preprocessing("datasets/processedNeutral.csv")

negative_df["mood"] = "negative"
positive_df["mood"] = "positive"
neutral_df["mood"] = "neutral"

tweets_df: pd.DataFrame = pd.concat([neutral_df, positive_df, negative_df])

display(negative_df.head(3))

Unnamed: 0,tweets,mood
0,how unhappy some dogs like it though,negative
1,talking to my over driver about where i'm goin...,negative
2,does anybody know if the rand's likely to fall...,negative


CPU times: user 221 ms, sys: 0 ns, total: 221 ms
Wall time: 228 ms


In [4]:
%%time
def convert_target_to_numeric(label):
    """The method converts the emotional connotation of a tweet from a string entry to a numeric one"""
    if label == "positive":
        return 1
    elif label == "negative":
        return -1
    else:
        return 0


tweets_df = tweets_df.sample(frac=1).reset_index(drop=True)
tweets_df["mood"] = tweets_df["mood"].apply(convert_target_to_numeric)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood
0,'shocked' by hc order on handover to cbi,0
1,up cm bans paan,0
2,hey everyone me and harry have just had pancak...,1


CPU times: user 4.98 ms, sys: 945 µs, total: 5.92 ms
Wall time: 7.62 ms


In [5]:
%%time
def remove_punctuation(text: str) -> str:
    """The method leaves only letters and spaces, removing digits and punctuation marks"""
    return re.sub(r"[^a-zA-Z\s]+", "", text)


tweets_df["tweets"] = tweets_df["tweets"].apply(remove_punctuation)
display(tweets_df.head(3))

Unnamed: 0,tweets,mood
0,shocked by hc order on handover to cbi,0
1,up cm bans paan,0
2,hey everyone me and harry have just had pancak...,1


CPU times: user 5.7 ms, sys: 1.66 ms, total: 7.37 ms
Wall time: 7.03 ms


In [6]:
def drop_stop_words(df: pd.DataFrame) -> pd.DataFrame:
    """Method removes stop words"""
    df["tweets"] = df["tweets"].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
    return df

tweets_df = drop_stop_words(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood
0,shocked hc order handover cbi,0
1,cm bans paan,0
2,hey everyone harry pancakes maple syrup breakf...,1


In [7]:
%%time
def add_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """The method converts the text of a tweet into tokens (individual words)"""
    df["tokens"] = df["tweets"].apply(word_tokenize)
    return df


tweets_df = add_tokens(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]"
1,cm bans paan,0,"[cm, bans, paan]"
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,..."


CPU times: user 127 ms, sys: 1.7 ms, total: 129 ms
Wall time: 131 ms


In [8]:
%%time
def add_stemming(df: pd.DataFrame) -> pd.DataFrame:
    """The method performs stemming over the tokens and merges them into a single text"""
    if "tokens" not in df.columns:
        df = add_tokens(df)
        
    df["stemmed_text"] = df["tokens"].apply(lambda words: [stemmer.stem(word) for word in words])
    df["stemmed_text"] = df["stemmed_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_stemming(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...


CPU times: user 140 ms, sys: 0 ns, total: 140 ms
Wall time: 140 ms


In [9]:
%%time
def add_lemma(df: pd.DataFrame) -> pd.DataFrame:
    """The method performs lemmatization over the tokens and merges them into a single text"""
    if "tokens" not in df.columns:
        df = add_tokens(df)

    df["lemmatized_text"] = df["tokens"].apply(lambda words: [token.lemma_ for token in nlp(" ".join(words))])
    df["lemmatized_text"] = df["lemmatized_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_lemma(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi,shock hc order handover cbi
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan,cm ban paan
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...,hey everyone harry pancakes maple syrup breakf...


CPU times: user 8.67 s, sys: 0 ns, total: 8.67 s
Wall time: 8.71 s


In [10]:
%%time
def add_misspellings(df: pd.DataFrame) -> pd.DataFrame:
    """The method performs misspelling over the tokens and merges them into a single text"""
    if "tokens" not in df.columns:
        df = add_tokens(df)
        
    df["misspell_text"] = df["tokens"].apply(lambda words: [spell(word) for word in words])
    df["misspell_text"] = df["misspell_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_misspellings(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi,shock hc order handover cbi,shocked hc order handover cbi
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan,cm ban paan,cm bans plan
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...,hey everyone harry pancakes maple syrup breakf...,hey everyone harry pancake maple syrup breakfa...


CPU times: user 35.1 s, sys: 0 ns, total: 35.1 s
Wall time: 35.2 s


In [11]:
%%time
def add_lemma_misspellings(df: pd.DataFrame) -> pd.DataFrame:
    """The method performs lemmatization and misspelling over the tokens and merges them into a single text"""
    if "lemmatized_text" not in df.columns:
        df = add_lemma(df)
        
    df["lemma_misspell_text"] = df["lemmatized_text"].apply(lambda words: [spell(word) for word in words.split()])
    df["lemma_misspell_text"] = df["lemma_misspell_text"].apply(lambda row: " ".join(row))
    return df


tweets_df = add_lemma_misspellings(tweets_df)

display(tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi,shock hc order handover cbi,shocked hc order handover cbi,shock hc order handover cbi
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan,cm ban paan,cm bans plan,cm ban plan
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...,hey everyone harry pancakes maple syrup breakf...,hey everyone harry pancake maple syrup breakfa...,hey everyone harry pancake maple syrup breakfa...


CPU times: user 35.7 s, sys: 0 ns, total: 35.7 s
Wall time: 35.7 s


# Creating a Bag Of Words and Adding New Attributes

In [17]:
%%time
def get_bag_of_words(df: pd.DataFrame) -> list[str]:
    """Method returns a bag of words consisting of all words 
    of the "misspell_text" attribute"""
    bag: list[str] = []
    if "misspell_text" in df.columns:
        words: set[str] = set([token.strip() for tokens in df["misspell_text"].values for token in tokens.split()])
        bag = sorted(words)
    
    return bag


def add_binary_words(df: pd.DataFrame) -> pd.DataFrame:
    """The method creates a new attribute from each word in the word 
    bag and populates it with 1 if the word occurs in the "misspell_text" 
    attribute and 0 if it does not"""
    bag_of_words: list[str] = get_bag_of_words(df)
    for col in df.columns:
        if col in bag_of_words:
            bag_of_words.remove(col)
    
    for word in bag_of_words:
        df[word] = df["misspell_text"].apply(lambda row: 1 if word in row else 0)
        
    return df


binary_tweets_df: pd.DataFrame = tweets_df.copy()
binary_tweets_df = add_binary_words(binary_tweets_df)

display(binary_tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,aa,aaaaaa,aaffaa,...,yoyoyou,yr,yrs,yuri,zabardast,zac,zealand,zero,zoo,zoos
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi,shock hc order handover cbi,shocked hc order handover cbi,shock hc order handover cbi,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan,cm ban paan,cm bans plan,cm ban plan,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...,hey everyone harry pancakes maple syrup breakf...,hey everyone harry pancake maple syrup breakfa...,hey everyone harry pancake maple syrup breakfa...,0,0,0,...,0,1,0,0,0,0,0,0,0,0


CPU times: user 6.34 s, sys: 15.5 ms, total: 6.35 s
Wall time: 6.36 s


In [18]:
%%time
def add_words_count(df: pd.DataFrame) -> pd.DataFrame: 
    """The method creates a new attribute from each word in the word bag and 
    fills it with a number equal to the number of times that the word 
    occurs in the "misspell_text" attribute for this entry"""
    bag_of_words: list[str] = get_bag_of_words(df)
    for col in df.columns:
        if col in bag_of_words:
            bag_of_words.remove(col)
        
    for word in bag_of_words:
        df[word] = df["misspell_text"].apply(lambda row: row.count(word))
        
    return df


count_tweets_df: pd.DataFrame = tweets_df.copy()
count_tweets_df = add_words_count(count_tweets_df)

display(count_tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,aa,aaaaaa,aaffaa,...,yoyoyou,yr,yrs,yuri,zabardast,zac,zealand,zero,zoo,zoos
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi,shock hc order handover cbi,shocked hc order handover cbi,shock hc order handover cbi,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan,cm ban paan,cm bans plan,cm ban plan,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...,hey everyone harry pancakes maple syrup breakf...,hey everyone harry pancake maple syrup breakfa...,hey everyone harry pancake maple syrup breakfa...,0,0,0,...,0,1,0,0,0,0,0,0,0,0


CPU times: user 6.59 s, sys: 67.8 ms, total: 6.66 s
Wall time: 6.65 s


In [19]:
%%time
def add_tfidf(df: pd.DataFrame) -> pd.DataFrame:
    """The method adds a "words_tfidf" column that contains the tfidf representation 
    for the "misspell_text" attribute of this record"""
    tfidf_matrix = tfidf_vect.fit_transform(df["misspell_text"])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                            columns=tfidf_vect.get_feature_names_out(), index=df.index)

    df["words_tfidf"] = tfidf_df.values.tolist()
    return df


tfidf_tweets_df: pd.DataFrame = tweets_df.copy()
tfidf_tweets_df = add_tfidf(tfidf_tweets_df)

display(tfidf_tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,words_tfidf
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi,shock hc order handover cbi,shocked hc order handover cbi,shock hc order handover cbi,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan,cm ban paan,cm bans plan,cm ban plan,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...,hey everyone harry pancakes maple syrup breakf...,hey everyone harry pancake maple syrup breakfa...,hey everyone harry pancake maple syrup breakfa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


CPU times: user 271 ms, sys: 232 ms, total: 503 ms
Wall time: 505 ms


In [20]:
%%time
def add_word2vec(df: pd.DataFrame, vector_size: int = 100, min_count: int = 1, 
                 window: int = 5, epochs: int = 100) -> pd.DataFrame:
    """The method adds a "words_word2vec" column that contains the word2vec representation 
    for the "misspell_text" attribute of this record"""
    
    sentences = [text.split() for text in df["misspell_text"]]
    model = Word2Vec(sentences, vector_size=vector_size, min_count=min_count, 
                     window=window, epochs=epochs)

    word2vec_vectors = []
    for text in sentences:
        if len(text) > 0:
            vector = sum(model.wv[word] for word in text if word in model.wv) / len(text)
        else:
            vector = [0] * vector_size 
        word2vec_vectors.append(vector)

    df["words_word2vec"] = word2vec_vectors
    return df


w2v_tweets_df: pd.DataFrame = tweets_df.copy()
w2v_tweets_df = add_word2vec(w2v_tweets_df)

display(w2v_tweets_df.head(3))

Unnamed: 0,tweets,mood,tokens,stemmed_text,lemmatized_text,misspell_text,lemma_misspell_text,words_word2vec
0,shocked hc order handover cbi,0,"[shocked, hc, order, handover, cbi]",shock hc order handov cbi,shock hc order handover cbi,shocked hc order handover cbi,shock hc order handover cbi,"[0.6803466, 0.7310252, 0.17579159, 0.11130236,..."
1,cm bans paan,0,"[cm, bans, paan]",cm ban paan,cm ban paan,cm bans plan,cm ban plan,"[0.6536197, 1.1341192, 0.44983387, -0.2060052,..."
2,hey everyone harry pancakes maple syrup breakf...,1,"[hey, everyone, harry, pancakes, maple, syrup,...",hey everyon harri pancak mapl syrup breakfast ...,hey everyone harry pancakes maple syrup breakf...,hey everyone harry pancake maple syrup breakfa...,hey everyone harry pancake maple syrup breakfa...,"[-1.2848617, 0.18830332, 0.31929505, -0.090066..."


CPU times: user 1.16 s, sys: 35.6 ms, total: 1.19 s
Wall time: 1.09 s


# Binary DF Prediction

In [21]:
%%time
X: pd.DataFrame = binary_tweets_df.iloc[:, 7:]
y: pd.Series = binary_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

pred_results: dict[str, float] = {}

CPU times: user 389 ms, sys: 59.7 ms, total: 448 ms
Wall time: 450 ms


In [22]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=100, max_depth=4)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_bin"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_bin"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_bin"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 299 ms, sys: 23.9 ms, total: 322 ms
Wall time: 322 ms


In [23]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_bin"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_bin"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_bin"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 22.9 s, sys: 28.6 s, total: 51.6 s
Wall time: 2.81 s


In [24]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.7729032258064517
Prediction type rand_for_mae_bin; Value -- 0.2503225806451613
Prediction type rand_for_mrse_bin; Value -- 0.5447698537441175
Prediction type log_reg_accur_bin; Value -- 0.8825806451612903
Prediction type log_reg_mae_bin; Value -- 0.15096774193548387
Prediction type log_reg_mrse_bin; Value -- 0.46697378526961475


# Count DF Prediction

In [25]:
%%time
X: pd.DataFrame = count_tweets_df.iloc[:, 7:]
y: pd.Series = count_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

CPU times: user 884 ms, sys: 1.23 s, total: 2.12 s
Wall time: 461 ms


In [26]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=50, max_depth=22)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_cnt"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_cnt"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_cnt"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 596 ms, sys: 12.2 ms, total: 608 ms
Wall time: 608 ms


In [27]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_cnt"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_cnt"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_cnt"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 42.1 s, sys: 1min, total: 1min 42s
Wall time: 5.4 s


In [28]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.7729032258064517
Prediction type rand_for_mae_bin; Value -- 0.2503225806451613
Prediction type rand_for_mrse_bin; Value -- 0.5447698537441175
Prediction type log_reg_accur_bin; Value -- 0.8825806451612903
Prediction type log_reg_mae_bin; Value -- 0.15096774193548387
Prediction type log_reg_mrse_bin; Value -- 0.46697378526961475
Prediction type rand_for_accur_cnt; Value -- 0.8374193548387097
Prediction type rand_for_mae_cnt; Value -- 0.17290322580645162
Prediction type rand_for_mrse_cnt; Value -- 0.43994134506405985
Prediction type log_reg_accur_cnt; Value -- 0.8748387096774194
Prediction type log_reg_mae_cnt; Value -- 0.16
Prediction type log_reg_mrse_cnt; Value -- 0.47924672075543584


# TFIDF DF Prediction

In [29]:
%%time
X: pd.DataFrame = tfidf_tweets_df["words_tfidf"].to_list()
y: pd.Series = tfidf_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

CPU times: user 127 ms, sys: 288 ms, total: 416 ms
Wall time: 22 ms


In [30]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=50, max_depth=22)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_tfidf"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_tfidf"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_tfidf"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 1.71 s, sys: 764 ms, total: 2.47 s
Wall time: 1.46 s


In [31]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_tfidf"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_tfidf"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_tfidf"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 13.8 s, sys: 19.1 s, total: 32.9 s
Wall time: 2.03 s


In [32]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.7729032258064517
Prediction type rand_for_mae_bin; Value -- 0.2503225806451613
Prediction type rand_for_mrse_bin; Value -- 0.5447698537441175
Prediction type log_reg_accur_bin; Value -- 0.8825806451612903
Prediction type log_reg_mae_bin; Value -- 0.15096774193548387
Prediction type log_reg_mrse_bin; Value -- 0.46697378526961475
Prediction type rand_for_accur_cnt; Value -- 0.8374193548387097
Prediction type rand_for_mae_cnt; Value -- 0.17290322580645162
Prediction type rand_for_mrse_cnt; Value -- 0.43994134506405985
Prediction type log_reg_accur_cnt; Value -- 0.8748387096774194
Prediction type log_reg_mae_cnt; Value -- 0.16
Prediction type log_reg_mrse_cnt; Value -- 0.47924672075543584
Prediction type rand_for_accur_tfidf; Value -- 0.8425806451612903
Prediction type rand_for_mae_tfidf; Value -- 0.16
Prediction type rand_for_mrse_tfidf; Value -- 0.4064004064006096
Prediction type log_reg_accur_tfidf; Value -- 0.8619354838709677
Prediction ty

# Word2Vec Prediction

In [33]:
%%time
X: pd.DataFrame = w2v_tweets_df["words_word2vec"].to_list()
y: pd.Series = w2v_tweets_df["mood"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

CPU times: user 22.3 ms, sys: 13.3 ms, total: 35.6 ms
Wall time: 1.88 ms


In [34]:
%%time
rand_for: RandomForestClassifier = RandomForestClassifier(random_state=21, n_estimators=50, max_depth=22)

rand_for.fit(X_train, y_train)
y_pred_rf = rand_for.predict(X_test)

pred_results["rand_for_accur_w2v"] = accuracy_score(y_test, y_pred_rf)
pred_results["rand_for_mae_w2v"] = mean_absolute_error(y_test, y_pred_rf)
pred_results["rand_for_mrse_w2v"] = np.sqrt(mean_squared_error(y_test, y_pred_rf))

CPU times: user 1.22 s, sys: 1.13 s, total: 2.35 s
Wall time: 880 ms


In [35]:
%%time
log_reg: LogisticRegression = LogisticRegression(random_state=21, solver="lbfgs", max_iter=500)

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

pred_results["log_reg_accur_w2v"] = accuracy_score(y_test, y_pred_lr)
pred_results["log_reg_mae_w2v"] = mean_absolute_error(y_test, y_pred_lr)
pred_results["log_reg_mrse_w2v"] = np.sqrt(mean_squared_error(y_test, y_pred_lr))

CPU times: user 665 ms, sys: 988 ms, total: 1.65 s
Wall time: 95.5 ms


In [36]:
for pred_type in pred_results:
    print(f"Prediction type {pred_type}; Value -- {pred_results[pred_type]}")

Prediction type rand_for_accur_bin; Value -- 0.7729032258064517
Prediction type rand_for_mae_bin; Value -- 0.2503225806451613
Prediction type rand_for_mrse_bin; Value -- 0.5447698537441175
Prediction type log_reg_accur_bin; Value -- 0.8825806451612903
Prediction type log_reg_mae_bin; Value -- 0.15096774193548387
Prediction type log_reg_mrse_bin; Value -- 0.46697378526961475
Prediction type rand_for_accur_cnt; Value -- 0.8374193548387097
Prediction type rand_for_mae_cnt; Value -- 0.17290322580645162
Prediction type rand_for_mrse_cnt; Value -- 0.43994134506405985
Prediction type log_reg_accur_cnt; Value -- 0.8748387096774194
Prediction type log_reg_mae_cnt; Value -- 0.16
Prediction type log_reg_mrse_cnt; Value -- 0.47924672075543584
Prediction type rand_for_accur_tfidf; Value -- 0.8425806451612903
Prediction type rand_for_mae_tfidf; Value -- 0.16
Prediction type rand_for_mrse_tfidf; Value -- 0.4064004064006096
Prediction type log_reg_accur_tfidf; Value -- 0.8619354838709677
Prediction ty

# Find Most Simmilar Twits

In [38]:
tfidf_matrix = tfidf_vect.fit_transform(tfidf_tweets_df["lemma_misspell_text"])
similarity_matrix: cosine_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

similar_documents_amount: int = 10 
for t, document in enumerate(tfidf_tweets_df["lemma_misspell_text"]):
    similar_indices = similarity_matrix[t].argsort()[: -similar_documents_amount - 1: -1][1: ]
    print(f"Наиболее похожие записи на '{document}':")
    
    for j in similar_indices:
        print(tweets_df.iloc[j]["lemma_misspell_text"])
    
    print()
    if t == similar_documents_amount:
        break

Наиболее похожие записи на 'shock hc order handover cbi':
calcutta hc send case cbi
rahul meet hc give cbi
supreme court term unfortunate ground cite bengal govt plea hc order ask cbi take case
that s shock evil unhappy
order ban
cbi book leader case
sc hear petition challenge bombay hc order beef ban
judge sc bench hear plea mara hc justice order stay transfer early
order car yesterday

Наиболее похожие записи на 'cm ban plan':
chief minister ban plan masala
order ban
sc ban
will not ban
pay hike plan
want fun plan weekend unhappy
bypass highway ban
ban ja hero
plan cap fee

Наиболее похожие записи на 'hey everyone harry pancake maple syrup breakfast great way start day happy miss':
great day everyone happy
good day hey happy
happy everyone happy
way
hey
hey
hey
hey
great day today happy

Наиболее похожие записи на 'welfare':
supreme court say welfare scheme proof
government can not make mandatory extend benefit welfare scheme
agitation delay day trouble soon come
hope louis get time 

* Выше представлен вариант, который учитывает дубликаты постов
* Ниже представлен вариант, который не учитывает дубликаты постов

In [39]:
tfidf_matrix = tfidf_vect.fit_transform(tfidf_tweets_df["lemma_misspell_text"].drop_duplicates(keep="first"))
similarity_matrix: cosine_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

similar_documents_amount: int = 10 
for t, document in enumerate(tfidf_tweets_df["lemma_misspell_text"]):
    similar_indices = similarity_matrix[t].argsort()[: -similar_documents_amount - 1: -1][1: ]
    print(f"Наиболее похожие записи на '{document}':")
    
    for j in similar_indices:
        print(tweets_df.iloc[j]["lemma_misspell_text"])
    
    print()
    if t == similar_documents_amount:
        break

Наиболее похожие записи на 'shock hc order handover cbi':
unhappy friendzone damn sad
quiz industry
bitter percent healthcare bengal
notice anything unusual happy birthday lovely footy happy
buy anything support canada
thank b happy
sudden blaze destroy bungalow
cashswap case
opt

Наиболее похожие записи на 'cm ban plan':
go workout get excited
buy anything support canada
peacefully sit read book work
top high value member week happy
do not
bypass highway ban
memorial get list
dealer say load car return due disabled lose sad
streaminuteg minute understanding thing still unhappy

Наиболее похожие записи на 'hey everyone harry pancake maple syrup breakfast great way start day happy miss':
hundred immigrant arrest routine us enforcement surge
academic siege
appreciate sentiment could not happy
hey
love
always know would like one well haha
thank recent follow happy connect happy great thursday want
indias opaque law
flash light pay parking

Наиболее похожие записи на 'welfare':
want kiss n

# Totals

* В процессе выполнения задания был обработан датафрейм (убраны стоп-слова (точность предсказаний без стоп-слов немного выше, чем с ними), добавлены лемматизация, стемминг, миссемплинг, мешок слов, а также tfidf и word2vec). наилучший результат предсказаний - 0.89 через логистическую регрессию на основе word2vec (результат может немного различаться ввиду случайности выборки)