## **Processamento de Linguagem Natural**

O dataset do YouTube Trends possui variáveis textuais com informações promissoras, que podem ajudar os modelos de regressão ou de classificação.

Entretanto, o texto precisa ser adequadamente limpo e processado antes de ser utilizado. Para isso, usaremos intensivamente técnicas de NLP e expressões regulares (regex).




In [0]:
import pandas as pd
import numpy as np
from re import sub

from numpy import asarray
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer 
from nltk import download

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from warnings import filterwarnings

filterwarnings('ignore')
download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Limpeza de dados + Engenharia de Atributos...

def remove_stopwords_and_normalize(doc_text, stopwords_hash):
  """Lowercase, strip, stopword-filter and stem a sequence of tokens.

  Args:
    doc_text: Iterable of string tokens.
    stopwords_hash: Container of lowercase stopwords that supports ``in``
      (a dict keyed by word, a set, ...). Only membership is checked; any
      values stored under the keys are ignored.

  Returns:
    List of Porter-stemmed tokens, in input order, with stopwords removed.
  """
  stemmer = PorterStemmer()
  normalized = (word.lower().strip() for word in doc_text)
  # Membership test rather than `.get(word) == None`: it also works for sets
  # and cannot mistake a key whose stored value is None for a non-stopword.
  return [stemmer.stem(word) for word in normalized if word not in stopwords_hash]


def tokenizer(text):
  r"""Split ``text`` into word tokens and drop repeated tokens.

  Tokens are the matches of the regex ``\w+``. Duplicates are removed
  case-sensitively (no lowercasing happens here), keeping the first
  occurrence of each token and the original order.

  Args:
    text: String to tokenize.

  Returns:
    List of unique tokens in order of first appearance.
  """
  word_tokenizer = RegexpTokenizer(r'\w+')
  # dict keys keep insertion order, so this is an order-preserving dedupe in
  # linear time instead of a list membership scan per token.
  return list(dict.fromkeys(word_tokenizer.tokenize(text)))


def data_cleaning(news_list, target_list):
  """Clean every document and keep it, with its target, if any text remains.

  Each document has punctuation and digits replaced by spaces, is tokenized,
  stripped of English stopwords, stemmed, and re-joined with single spaces.
  Documents that end up empty are dropped together with their target.

  Note: ``tokenizer`` removes duplicates before lowercasing/stemming, so the
  cleaned text may still contain repeated stems.

  Args:
    news_list: Iterable of document strings (missing values must be removed
      beforehand; the regex substitution requires strings).
    target_list: Targets aligned with ``news_list`` by position.

  Returns:
    Tuple ``(X_clean, Y_clean)`` of two equally long lists: cleaned texts
    and the targets of the documents that were kept.
  """
  # Materialise the targets so they are paired with documents strictly by
  # position. Subscripting a pandas Series with the loop counter would be a
  # label lookup and breaks (or misaligns) when the index is not 0..n-1.
  targets = list(target_list)
  stopwords_dict = {word: 0 for word in stopwords.words('english')}

  X_clean, Y_clean = [], []
  for idx, news in enumerate(news_list):
    text = sub(r'[^\w\s]', ' ', news)  # punctuation -> space
    text = sub(r'\d', ' ', text)       # digits -> space (same set as [^\D])
    tokens = tokenizer(text)
    tokens = remove_stopwords_and_normalize(tokens, stopwords_dict)
    text = ' '.join(tokens).strip()
    if text:
      X_clean.append(text)
      Y_clean.append(targets[idx])
  return X_clean, Y_clean


In [0]:
# Load the dataset from a CSV file resolved relative to the current working
# directory, then display its column names.
df = pd.read_csv('sentiments.csv')
df.columns

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'description', 'likes_perc', 'dislikes_perc', 'comment_perc', 'emotion',
       'pop'],
      dtype='object')

In [0]:
# Keep only the tag text and the target, discard rows where either is
# missing, and renumber the index from 0 so positions and labels coincide.
df2 = (
    df[['tags', 'pop']]
    .dropna()
    .reset_index(drop=True)
)
df2.shape

(6334, 2)

In [0]:
# Text feature: an independent copy of the `tags` column.
df_text = df2.tags.copy()

In [0]:
# Target: the `pop` column, kept as a one-column DataFrame (double brackets).
df_target = df2[['pop']].copy()

In [0]:
# np.squeeze drops the length-1 column axis of the target frame before it is
# paired with the tag text; documents whose cleaned text is empty are removed
# from both X and y.
X, y = data_cleaning(df_text, np.squeeze(df_target))

In [0]:
# Feature engineering + text classification: TF-IDF over the cleaned tags fed
# to a random forest, evaluated with stratified 7-fold cross-validation.

from sklearn.utils import resample

X, y = asarray(X), asarray(y)

kfold = StratifiedKFold(n_splits=7, random_state=42, shuffle=True)

for iteration, (train_index, test_index) in enumerate(kfold.split(X, y), start=1):

    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = y[train_index], y[test_index]

    ##############################################
    # Class balancing -- applied to the training split only, so the test fold
    # keeps its original class distribution.

    X2 = pd.concat([pd.Series(X_train), pd.Series(Y_train)], axis=1)
    X2.columns = ['tags', 'pop']

    # Split the training rows by class.
    not_pop = X2[X2['pop'] == 0]
    pop = X2[X2['pop'] == 1]

    # Resample class 1 with replacement up to the size of class 0.
    # NOTE: this assumes class 1 is the minority, which matches the fold
    # supports printed by this run (about 310 vs 596).
    pop_upsampled = resample(pop,
                             replace=True,            # sample with replacement
                             n_samples=len(not_pop),  # match the class-0 count
                             random_state=27)         # reproducible results

    # Balanced training set: all class-0 rows plus the resampled class-1 rows.
    upsampled = pd.concat([not_pop, pop_upsampled])

    Y_train = upsampled['pop']
    X_train = upsampled['tags']

    #################################################
    # Vocabulary and IDF weights are learned from the balanced training text
    # only; the test fold is merely transformed. Duplicated rows created by
    # the resampling count towards min_df.
    vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1),
                                 min_df=5, max_df=0.8)

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    classifier = RandomForestClassifier(random_state=5)
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)

    print(f'Fold: {iteration}')
    print(classification_report(Y_test, predictions, target_names=['less 1M','break 1M']),'\n\n')

Fold: 1
              precision    recall  f1-score   support

     less 1M       0.75      0.89      0.81       596
    break 1M       0.66      0.42      0.51       310

    accuracy                           0.73       906
   macro avg       0.70      0.65      0.66       906
weighted avg       0.72      0.73      0.71       906
 


Fold: 2
              precision    recall  f1-score   support

     less 1M       0.75      0.85      0.80       596
    break 1M       0.61      0.46      0.53       310

    accuracy                           0.72       906
   macro avg       0.68      0.66      0.66       906
weighted avg       0.70      0.72      0.71       906
 


Fold: 3
              precision    recall  f1-score   support

     less 1M       0.75      0.85      0.80       596
    break 1M       0.61      0.44      0.51       309

    accuracy                           0.71       905
   macro avg       0.68      0.65      0.65       905
weighted avg       0.70      0.71      0.70 