In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display, Markdown
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from random import random

<h1>1. Подготовить данные для построения классификационных моделей.</h1>

In [3]:
df = pd.read_csv('Emotion_classify_Data.csv').rename(columns={'Comment': 'text', 'Emotion': 'sentiment'})
df

Unnamed: 0,text,sentiment
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear


<h3>Распределение по классам в датасете</h3>

In [4]:
df['sentiment'].value_counts()

sentiment
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [6]:
vectorizer = CountVectorizer(dtype=bool, lowercase=False)
text = df['text']
X = vectorizer.fit_transform(text)
features = vectorizer.get_feature_names_out()


binarised_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
binarised_df['sentiment'] = df['sentiment']
filtered_columns = [col for col in binarised_df.columns if col.isalpha()]
binarised_df = binarised_df[filtered_columns]
binarised_df

Unnamed: 0,aa,aac,aaron,ab,abandon,abandoned,abandonment,abbigail,abc,abdomen,...,zero,zest,zhu,zipline,zombies,zone,zonisamide,zq,zumba,sentiment
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,anger
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,joy
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5932,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
5933,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,anger
5934,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,joy
5935,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear


<h1>2. Все слова в одном регистре</h1>

In [7]:
label_encoder = LabelEncoder()
binarised_df['labels'] = label_encoder.fit_transform(df['sentiment'])
full_data = binarised_df
X=full_data.drop(["sentiment","labels"], axis =1)
Y=full_data.labels
X_train, X_test, Y_train, Y_test=train_test_split(X,Y, test_size=0.2, random_state=10)
X_train

Unnamed: 0,aa,aac,aaron,ab,abandon,abandoned,abandonment,abbigail,abc,abdomen,...,zendikar,zero,zest,zhu,zipline,zombies,zone,zonisamide,zq,zumba
4984,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4270,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2936,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5297,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4526,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3441,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1344,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4623,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


<h3>Случайный лес</h3>

In [8]:
def random_forest(X_train, Y_train, X_test, Y_test):
    model = RandomForestClassifier()
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    display(Markdown(f"<h4>Результат на тестовой выборке {sum(predictions==Y_test)} из {len(Y_test)}</h4>"))
    display(pd.Series(model.feature_importances_, index=X_train.columns))

random_forest(X_train, Y_train, X_test, Y_test)

<h4>Результат на тестовой выборке 1099 из 1188</h4>

aa            0.000021
aac           0.000012
aaron         0.000014
ab            0.000000
abandon       0.000018
                ...   
zombies       0.000014
zone          0.000014
zonisamide    0.000011
zq            0.000000
zumba         0.000003
Length: 8953, dtype: float64

<h3>Логистическая регрессия</h3>

In [9]:
def log_regression(X_train, Y_train, X_test, Y_test):
    model = LogisticRegression(max_iter=300)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    display(Markdown(f"<h4>Результат на тестовой выборке {sum(predictions==Y_test)} из {len(Y_test)}</h4>"))
    display(model.coef_)

log_regression(X_train, Y_train, X_test, Y_test)

<h4>Результат на тестовой выборке 1124 из 1188</h4>

array([[ 0.07029255,  0.00298238,  0.03433187, ..., -0.02561751,
         0.01859635, -0.02246681],
       [-0.02394323, -0.00249097, -0.00755255, ..., -0.04905797,
        -0.01283668, -0.01453954],
       [-0.04634932, -0.00049142, -0.02677932, ...,  0.07467548,
        -0.00575966,  0.03700636]])

<h3>Наивный байесовский классификатор</h3>

In [10]:
def naive_bayesian(X_train, Y_train, X_test, Y_test):
    model = GaussianNB()
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    display(Markdown(f"<h4>Результат на тестовой выборке {sum(predictions==Y_test)} из {len(Y_test)}</h4>"))

naive_bayesian(X_train, Y_train, X_test, Y_test)

<h4>Результат на тестовой выборке 682 из 1188</h4>

<h3>Метод опорных векторов</h3>

In [11]:
def support_vectors(X_train, Y_train, X_test, Y_test):
    model = svm.SVC(kernel='linear')
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    display(Markdown(f"<h4>Результат на тестовой выборке {sum(predictions==Y_test)} из {len(Y_test)}</h4>"))
    display(model.coef_)

support_vectors(X_train, Y_train, X_test, Y_test)

<h4>Результат на тестовой выборке 1124 из 1188</h4>

array([[ 0.17987637,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.04069072,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.00228886,
         0.        ,  0.        ]])

<h3>K ближайших соседей</h3>

In [12]:
def k_nearest_neighbours(X_train, Y_train, X_test, Y_test):
    model = KNeighborsClassifier(n_neighbors=4)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    display(Markdown(f"<h4>Результат на тестовой выборке {sum(predictions==Y_test)} из {len(Y_test)}</h4>"))

k_nearest_neighbours(X_train, Y_train, X_test, Y_test)

<h4>Результат на тестовой выборке 635 из 1188</h4>

<h1>3. Слова в разных регистрах</h1>

In [12]:
any_case_data = df.copy()
any_case_data['upper_text'] = any_case_data['text'].transform(lambda x: ' '.join([word.capitalize() if random()<0.1 else word for word in x.split(' ')]))
vectorizer = CountVectorizer(dtype=bool, lowercase=False)
same_case_text = any_case_data.upper_text
X = vectorizer.fit_transform(same_case_text)

any_case_binarised_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
any_case_binarised_df['sentiment'] = any_case_data['sentiment']
any_case_binarised_df

Unnamed: 0,Abandoned,Abandonment,Abilities,Ability,Able,About,Absolutely,Absurdity,Acceptance,Accepted,...,zendikar,zero,zhu,zipline,zombies,zone,zonisamide,zq,zumba,sentiment
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,anger
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,joy
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5932,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear
5933,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,anger
5934,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,joy
5935,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,fear


In [13]:
X=any_case_binarised_df.drop(["sentiment"], axis =1)
Y=any_case_binarised_df.sentiment
X_train_anycase, X_test_anycase, Y_train_anycase, Y_test_anycase = train_test_split(X,Y, test_size=0.2, random_state=10)

<h3>Случайный лес</h3>

In [14]:
random_forest(X_train_anycase, Y_train_anycase, X_test_anycase, Y_test_anycase)

<h4>Результат на тестовой выборке 1048 из 1188</h4>

Abandoned      0.000085
Abandonment    0.000024
Abilities      0.000011
Ability        0.000004
Able           0.000122
                 ...   
zombies        0.000017
zone           0.000004
zonisamide     0.000004
zq             0.000007
zumba          0.000004
Length: 10753, dtype: float64

<h3>Логистическая регрессия</h3>

In [15]:
log_regression(X_train_anycase, Y_train_anycase, X_test_anycase, Y_test_anycase)

<h4>Результат на тестовой выборке 1092 из 1188</h4>

array([[-0.15230996,  0.20849644, -0.06866797, ..., -0.0281314 ,
         0.01673276, -0.03809913],
       [ 0.28982546, -0.09025998, -0.02755846, ..., -0.06037924,
        -0.01159191, -0.02386792],
       [-0.13751549, -0.11823646,  0.09622643, ...,  0.08851064,
        -0.00514085,  0.06196705]])

<h3>Наивный байесовский классификатор</h3>

In [16]:
naive_bayesian(X_train_anycase, Y_train_anycase, X_test_anycase, Y_test_anycase)

<h4>Результат на тестовой выборке 699 из 1188</h4>

<h3>Метод опорных векторов</h3>

In [17]:
support_vectors(X_train_anycase, Y_train_anycase, X_test_anycase, Y_test_anycase)

<h3>K ближайших соседей</h3>

In [1]:
k_nearest_neighbours(X_train_anycase, Y_train_anycase, X_test_anycase, Y_test_anycase)

NameError: name 'k_nearest_neighbours' is not defined

<h1>4. Удаление стоп-слов (общеупотребимых слов, которые встречаются в любых сообщениях)</h1>

In [14]:
words_count = binarised_df.groupby('sentiment').sum()
total_words = words_count.sum().sum()
popular_words = [column for column in words_count.columns if words_count[column].sum()/total_words>0.005]
words_count[popular_words]

Unnamed: 0_level_0,about,am,and,be,but,feel,feeling,for,have,im,...,of,on,so,that,the,this,to,was,when,with
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
anger,191,222,854,200,288,1271,647,252,229,280,...,464,163,291,524,687,245,770,252,179,279
fear,303,244,827,158,289,1156,725,182,192,286,...,504,164,258,427,687,224,779,259,183,177
joy,184,251,916,251,244,1410,540,280,278,264,...,493,191,285,547,791,238,867,203,137,249


In [None]:
no_stop_words = binarised_df.drop(columns=popular_words)
X=no_stop_words.drop(["sentiment"], axis =1)
Y=no_stop_words.sentiment
X_train_no_stop_words, X_test_no_stop_words, Y_train_no_stop_words, Y_test_no_stop_words = train_test_split(X,Y, test_size=0.2, random_state=10)

<h3>Случайный лес</h3>

In [None]:
random_forest(X_train_no_stop_words, Y_train_no_stop_words, X_test_no_stop_words, Y_test_no_stop_words)

<h4>Результат на тестовой выборке 1111 из 1188</h4>

aa            3.775917e-05
aac           9.457826e-06
aaron         1.148230e-05
ab            8.803381e-07
abandon       2.773062e-06
                  ...     
zombies       1.409970e-05
zone          2.349566e-05
zonisamide    1.301439e-05
zq            2.310445e-06
zumba         4.667077e-07
Length: 8925, dtype: float64

<h3>Логистическая регрессия</h3>

In [None]:
log_regression(X_train_no_stop_words, Y_train_no_stop_words, X_test_no_stop_words, Y_test_no_stop_words)

<h4>Результат на тестовой выборке 1116 из 1188</h4>

array([[ 0.07071827,  0.0034908 ,  0.03741188, ..., -0.03752156,
         0.01897299, -0.0245887 ],
       [ 0.00300778, -0.00281529, -0.00967599, ..., -0.06105751,
        -0.01254145, -0.01583556],
       [-0.07372605, -0.00067551, -0.02773589, ...,  0.09857907,
        -0.00643154,  0.04042426]])

<h3>Наивный байесовский классификатор</h3>

In [None]:
naive_bayesian(X_train_no_stop_words, Y_train_no_stop_words, X_test_no_stop_words, Y_test_no_stop_words)

<h4>Результат на тестовой выборке 682 из 1188</h4>

<h3>Метод опорных векторов</h3>

In [None]:
support_vectors(X_train_no_stop_words, Y_train_no_stop_words, X_test_no_stop_words, Y_test_no_stop_words)

<h4>Результат на тестовой выборке 1117 из 1188</h4>

array([[ 0.14456217,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05032736,  0.        ,  0.        , ..., -0.04313864,
         0.        , -0.01064536],
       [ 0.        ,  0.        ,  0.        , ..., -0.00350565,
         0.        ,  0.        ]])

<h3>K ближайших соседей</h3>

In [None]:
k_nearest_neighbours(X_train_no_stop_words, Y_train_no_stop_words, X_test_no_stop_words, Y_test_no_stop_words)

<h4>Результат на тестовой выборке 755 из 1188</h4>

<h1>5. Выделение только специфических слов, которые характерны для того или иного класса</h1>

In [15]:
X=binarised_df.drop(["sentiment"], axis =1)
Y=binarised_df.sentiment
X_train_inner_popular, X_test_inner_popular, Y_train_inner_popular, Y_test_inner_popular = train_test_split(X,Y, test_size=0.2, random_state=10)

anger_popular_words = [column for column in X_train_inner_popular.columns if words_count.loc['anger'][column]/words_count[column].sum()>0.5]
fear_popular_words = [column for column in X_train_inner_popular.columns if words_count.loc['fear'][column]/words_count[column].sum()>0.5]
joy_popular_words = [column for column in X_train_inner_popular.columns if words_count.loc['joy'][column]/words_count[column].sum()>0.5]
display(anger_popular_words)
display(fear_popular_words)
display(joy_popular_words)

['aac',
 'aaron',
 'abandonment',
 'abide',
 'absolute',
 'abuses',
 'abusive',
 'abyss',
 'academia',
 'accelerated',
 'accentuating',
 'accept',
 'accepts',
 'access',
 'acctually',
 'accumulation',
 'accurate',
 'acquainted',
 'acted',
 'actions',
 'actively',
 'activist',
 'acts',
 'actual',
 'actuality',
 'adaption',
 'addicted',
 'addictive',
 'adequately',
 'administration',
 'admittedly',
 'advantage',
 'advocates',
 'affair',
 'affeccion',
 'affect',
 'affects',
 'afrade',
 'african',
 'aftermath',
 'against',
 'aggravated',
 'aggression',
 'agiatated',
 'agitated',
 'agony',
 'agree',
 'agreed',
 'agreement',
 'ah',
 'ahaha',
 'ailments',
 'aimed',
 'airritated',
 'akward',
 'albeit',
 'alcohol',
 'alexis',
 'alice',
 'aligncenter',
 'allocated',
 'alot',
 'amish',
 'ampatuan',
 'anal',
 'ancients',
 'andreas',
 'andy',
 'anger',
 'angered',
 'angie',
 'angry',
 'angst',
 'animosity',
 'anipike',
 'ankle',
 'anne',
 'annoyed',
 'annoying',
 'annual',
 'annulment',
 'anonymous

['abbigail',
 'abdomen',
 'abnormally',
 'abruptly',
 'academic',
 'accepting',
 'accident',
 'acclimated',
 'accomplishments',
 'according',
 'accounted',
 'accustomed',
 'achievements',
 'aching',
 'acne',
 'acronym',
 'actresses',
 'acutely',
 'add',
 'addressing',
 'adds',
 'admin',
 'admitted',
 'admitting',
 'adn',
 'adopt',
 'adoption',
 'adress',
 'advocate',
 'affecting',
 'afford',
 'afp',
 'afraid',
 'aged',
 'ahkman',
 'aimlessly',
 'airtime',
 'alarm',
 'alarmed',
 'albino',
 'alignment',
 'allah',
 'alliance',
 'aloof',
 'alternated',
 'alway',
 'amazingness',
 'among',
 'amounts',
 'andintrupte',
 'angeles',
 'angsty',
 'anniversary',
 'annoyance',
 'antagonism',
 'antics',
 'anxieties',
 'anxious',
 'aout',
 'apartment',
 'apawa',
 'apaya',
 'apgujeong',
 'applying',
 'apprehensive',
 'approaching',
 'apraxia',
 'apt',
 'arabs',
 'area',
 'arena',
 'arm',
 'armed',
 'armistice',
 'arrangment',
 'arrive',
 'article',
 'aryiku',
 'ashers',
 'asian',
 'aspects',
 'ass',
 '

['ab',
 'abc',
 'abelard',
 'abilities',
 'abound',
 'above',
 'abroad',
 'absence',
 'absolutly',
 'absorbed',
 'absurdity',
 'academics',
 'accent',
 'acceptable',
 'accepted',
 'accommodation',
 'accompany',
 'accomplished',
 'accomplishing',
 'accomplishment',
 'accordance',
 'accumulate',
 'accumulated',
 'acheivment',
 'achieved',
 'achieving',
 'acknowledgment',
 'aconfident',
 'activism',
 'activity',
 'actor',
 'actors',
 'adapted',
 'address',
 'adherence',
 'adjust',
 'adjustment',
 'administrator',
 'administrators',
 'admired',
 'admires',
 'admission',
 'admittance',
 'adorable',
 'advancement',
 'advances',
 'adventures',
 'adventurous',
 'advise',
 'aesthetics',
 'affections',
 'affend',
 'afield',
 'afternoon',
 'afterward',
 'agenda',
 'agent',
 'agnostics',
 'ahahahaha',
 'aircleaner',
 'airports',
 'alba',
 'album',
 'alcest',
 'alert',
 'ali',
 'alight',
 'alise',
 'alive',
 'allergies',
 'allies',
 'allthingsbucks',
 'aloft',
 'along',
 'alongside',
 'alphabet',
 

In [16]:
only_inner_popular_words = []
only_inner_popular_words.extend(anger_popular_words)
only_inner_popular_words.extend(fear_popular_words)
only_inner_popular_words.extend(joy_popular_words)
X_train_inner_popular = X_train_inner_popular[only_inner_popular_words]
X_test_inner_popular = X_test_inner_popular[only_inner_popular_words]

<h3>Случайный лес</h3>

In [17]:
random_forest(X_train_inner_popular, Y_train_inner_popular, X_test_inner_popular, Y_test_inner_popular)

<h4>Результат на тестовой выборке 1187 из 1188</h4>

aac            6.973996e-07
aaron          6.639124e-07
abandonment    5.291696e-07
abide          6.556306e-06
absolute       0.000000e+00
                   ...     
zendikar       0.000000e+00
zipline        3.081721e-05
zone           8.237677e-06
zonisamide     0.000000e+00
zumba          4.144925e-06
Length: 6820, dtype: float64

<h3>Логистическая регрессия</h3>

In [18]:
log_regression(X_train_inner_popular, Y_train_inner_popular, X_test_inner_popular, Y_test_inner_popular)

<h4>Результат на тестовой выборке 1188 из 1188</h4>

array([[ 3.62796561e-03,  5.54238188e-03,  7.03166130e-03, ...,
        -4.32479037e-03,  6.80133139e-08,  6.77822100e-08],
       [-3.62841239e-03, -5.54286088e-03, -7.03215445e-03, ...,
        -4.00055276e-03, -5.82610612e-03, -3.74347186e-03],
       [ 4.46784869e-07,  4.78994882e-07,  4.93157174e-07, ...,
         8.32534313e-03,  5.82603810e-03,  3.74340408e-03]])

<h3>Наивный байесовский классификатор</h3>

In [19]:
naive_bayesian(X_train_inner_popular, Y_train_inner_popular, X_test_inner_popular, Y_test_inner_popular)

<h4>Результат на тестовой выборке 1163 из 1188</h4>

<h3>Метод опорных векторов</h3>

In [20]:
support_vectors(X_train_inner_popular, Y_train_inner_popular, X_test_inner_popular, Y_test_inner_popular)

<h4>Результат на тестовой выборке 1188 из 1188</h4>

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.00035486, 0.        ,
        0.        ]])

<h3>K ближайших соседей</h3>

In [21]:
k_nearest_neighbours(X_train_inner_popular, Y_train_inner_popular, X_test_inner_popular, Y_test_inner_popular)

<h4>Результат на тестовой выборке 1187 из 1188</h4>