In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Load the comments table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='comments.csv')
# A bare trailing expression lets the notebook render the frame.
df

In [None]:
# Show the distributions of the Likes and Sentiment columns side by side.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: Likes, split into two bins.
df['Likes'].plot.hist(bins=2, ax=axes[0], title='Likes Distribution')

# Right panel: Sentiment, with the default number of bins.
df['Sentiment'].plot.hist(ax=axes[1], title='Sentiment Distribution')

plt.show()

In [None]:
# Vectorize the text with Word2Vec.
# Every comment is cast to str and split on whitespace; note that str() turns a
# missing (NaN) value into the literal token 'nan'.
comments = [str(text).split() for text in df['Comment']]

# Train 100-dimensional embeddings on the tokenized comments; min_count=1 keeps
# every token in the vocabulary.
w2v_model = Word2Vec(
    sentences=comments,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Represent each comment by the mean of its token embeddings.
# A comment that yields no tokens (e.g. whitespace-only text) cannot be looked
# up in the model -- indexing the keyed vectors with an empty list raises -- so
# it falls back to an all-zero vector of the same size and dtype.
w2v_vectors = [
    w2v_model.wv[tokens].mean(axis=0)
    if tokens
    else np.zeros(w2v_model.vector_size, dtype=np.float32)
    for tokens in comments
]

# Preview only the first few vectors instead of dumping the whole list.
w2v_vectors[:5]

In [None]:
# Attach the comment embeddings to the main table as w2v_0 ... w2v_{n-1}.
# - The column count comes from the trained model rather than a repeated
#   literal, so it follows any change to vector_size.
# - Reusing df's index makes the row alignment explicit: a length mismatch
#   raises here instead of letting concat silently pad rows with NaN.
w2v_df = pd.DataFrame(
    w2v_vectors,
    columns=[f'w2v_{i}' for i in range(w2v_model.vector_size)],
    index=df.index,
)

# Drop embedding columns left over from a previous run of this cell so that
# re-running it does not append duplicate w2v_* columns.
df = pd.concat(
    [df.drop(columns=w2v_df.columns, errors='ignore'), w2v_df],
    axis=1,
)

In [None]:
# Remove the columns that are not used as model inputs or as the target.
df_cleared = df.drop(columns=["Unnamed: 0", "Video ID", "Comment"])

In [None]:
# Split the data into training and test sets.
# X holds every column of df_cleared except the target; y is the target.
X = df_cleared.drop(columns=['Sentiment'])
y = df_cleared['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the training features before PCA.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Project the standardized training features onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_scaled)

# Build the plotting frame: the two components plus the matching labels.
# .values drops y_train's shuffled index so labels line up positionally.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(
    Sentiment=y_train.values
)

# Scatter plot of the PCA space, colored by Sentiment.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='Sentiment',
    palette='viridis',
    s=50,
    alpha=0.8,
)
plt.title('PCA of w2v Embeddings with Sentiment Coloring')
plt.legend(title='Sentiment')
plt.show()

In [None]:
# Keep only the rows whose Sentiment label is not 2.0.
# NOTE(review): the plot title treats 2.0 as the neutral class -- confirm this
# against the dataset's label encoding.
pca_df = pca_df.loc[pca_df['Sentiment'].ne(2.0)]

# Scatter plot of the PCA space for the remaining classes, colored by Sentiment.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='Sentiment',
    palette='viridis',
    s=50,
    alpha=0.8,
)
plt.title('PCA of w2v Embeddings with Sentiment Coloring without neutral')
plt.legend(title='Sentiment')
plt.show()