<a href="https://colab.research.google.com/github/evlko/CS-224W/blob/main/Lab_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Предобработка

In [2]:
import nltk
import re
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('movie_reviews', quiet=True)

True

In [3]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

Фунция для удаления слов, которые начинаются с заданного символа:

In [4]:
def remove_words_with_sym(text, symbol='#'):
  return re.sub(r'{}[^\s]*'.format(symbol), '', text)

assert remove_words_with_sym('Hello, it\'s a me, Mario!, #Top') == 'Hello, it\'s a me, Mario!, '
assert remove_words_with_sym('Hello, it\'s a me, @Mario!', '@') == 'Hello, it\'s a me, '

Функция для обработки текста, которая
1. Обращается к `remove_words_with_sym`, чтобы убрать слова, которые начинаются с запрещенных символов;
2. Убирает пунктуацию;
3. Приводит к нижнему регистру;
4. Убирает стоп слова;
5. Лемматизирует.

In [34]:
def preprocess(text, special_symbols=['#', '@', 'http', 'pic twitter c']):
    for special_symbol in special_symbols:
      text = remove_words_with_sym(text, special_symbol)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = [lemmatizer.lemmatize(word) for word in word_tokenize(text) if lemmatizer.lemmatize(word) not in stop_words] 
    text = ' '.join(text)

    return text

assert preprocess('Hello, it\'s a me, Mario!') == 'hello mario'

In [35]:
# получаем данные
tweets_in = pd.read_csv('https://raw.githubusercontent.com/evlko/CS-224W/main/Data/Twitter/tweets_covid19_IN.csv')
# оставляем только те, у которых "класс" = sad или joy
tweets_in = tweets_in.loc[tweets_in['sentiment'].isin(['sad', 'joy']), ['text', 'sentiment']]
# обрабатываем текст
tweets_in['text'] = tweets_in['text'].apply(preprocess)
# проверяем, что всё ок
tweets_in

Unnamed: 0,text,sentiment
0,agree poor india treated badly poor seek livin...,sad
1,could spent cutie vc sakshi__s n g h coast cro...,joy
2,nature conservation remain priority post coron...,joy
3,coronavirus disappearing italy show intellectu...,sad
4,uk record lowest daily virus death toll since ...,sad
...,...,...
3083,wa tough see go brother excellent 60 day toget...,joy
3085,today 02 30pm 54 year old bangladeshi male adm...,sad
3087,issa date lockdown end inshaallah corona red_h...,joy
3088,death toll due covid 19 rose 31 jammu kashmir ...,sad


# Обучение Модели

In [None]:
!pip install optuna

In [8]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [36]:
X_train, X_test, y_train, y_test = train_test_split(tweets_in['text'], tweets_in['sentiment'], train_size=0.8)

Для настройки гиперпамаетров воспользуемся `optuna` и создадим функцию `objective`, которая будет "обучаться":

In [37]:
def objective(trial):
  vectorizer_name = trial.suggest_categorical('vectorizer', ['CountVectorizer', 'TfidfVectorizer'])
  analyzer = trial.suggest_categorical('analyzer', ['word', 'char', 'char_wb'])
  min_df = trial.suggest_int('min_df', 1, 10, step=1)
  max_df = trial.suggest_float('max_df', 0.5, 1, step=0.05)
  max_features = trial.suggest_int('max_features', 5000, 10000, 500)
  n_grams_max = trial.suggest_int('ngram_range_max', 2, 10, 1) 
  n_grams_min = trial.suggest_int('ngram_range_min', 1, n_grams_max, 1) 

  if vectorizer_name == 'TfidfVectorizer':
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_features=max_features,  max_df=max_df, min_df=min_df, ngram_range=(n_grams_min, n_grams_max))
  else:
    vectorizer = CountVectorizer(analyzer=analyzer, max_features=max_features,  max_df=max_df, min_df=min_df, ngram_range=(n_grams_min, n_grams_max))
  
  model = Pipeline([
           ('vectorizer', vectorizer),
           ('clf', MultinomialNB()),
  ])

  model.fit(X_train, y_train)
  return model.score(X_test, y_test)

Ищем лучшие гиперпараметры

In [38]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=60, n_jobs=1, show_progress_bar=True)

  self._init_valid()


   0%|          | 00:00/01:00

Обучаем финальную модель

In [39]:
model = Pipeline([
           ('clf', MultinomialNB()),
])

model_params = study.best_params
vectorizer = model_params.pop('vectorizer')
model_params['ngram_range'] = (model_params['ngram_range_min'], model_params['ngram_range_max'])
del model_params['ngram_range_min'], model_params['ngram_range_max']

if vectorizer == 'TfidfVectorizer':
  model.steps.insert(0, ['tfidf', TfidfVectorizer()])
  model_params = {f"tfidf__{key}": val for key, val in model_params.items()}
else:
  model.steps.insert(0, ['ngram', CountVectorizer()])
  model_params = {f"ngram__{key}": val for key, val in model_params.items()}

model.set_params(**model_params)
model.fit(X_train, y_train)

# Применение Модели

In [47]:
# получаем данные
tweets_gl = pd.read_csv('https://raw.githubusercontent.com/evlko/CS-224W/main/Data/Twitter/tweets_covid19_GL.csv')
# оставляем только нужные данные
tweets_gl = tweets_gl[['text']]
# обрабатываем текст
tweets_gl['text'] = tweets_gl['text'].apply(preprocess)

In [41]:
predictions = model.predict(tweets_gl['text'])

tweets_gl['class'] = predictions

tweets_gl['class'].value_counts(normalize=True)

sad    0.605501
joy    0.394499
Name: class, dtype: float64

# Анализ Тональности

In [42]:
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

In [49]:
tb = Blobber(analyzer=NaiveBayesAnalyzer())

tweets_gl['sentiment'] = tweets_gl['text'].apply(lambda x: tb(x).sentiment.classification)

tweets_gl['sentiment'].value_counts(normalize=True)

pos    0.69845
neg    0.30155
Name: sentiment, dtype: float64