In [None]:
#importando bibliotecas
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Upload the dataset file (Google Colab only).
# The cell output shows the file being saved as cyberbullying_tweets.csv;
# the value returned by files.upload() is kept in `upload` but is not used
# again in the visible cells.
from google.colab import files
upload = files.upload()

Saving cyberbullying_tweets.csv to cyberbullying_tweets.csv


In [None]:
# Load the uploaded CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv("cyberbullying_tweets.csv")
dataset.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [None]:
# List the distinct category values found in the label column.
dataset["cyberbullying_type"].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [None]:
# Dimensions of the data as (rows, columns).
dataset.shape

(47692, 2)

In [None]:
# Number of documents (tweets) per class.
dataset["cyberbullying_type"].value_counts()

religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64

In [None]:
# Split the frame into the raw documents (tweet text) and their class labels.
documentos, classes = dataset["tweet_text"], dataset["cyberbullying_type"]

In [None]:
import spacy
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

!python -m spacy download en_core_web_sm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
nlp_spacy = spacy.load('en_core_web_sm')
# NOTE(review): limpar_texto only reads token.lemma_; the parser/NER components
# could probably be disabled at load time for speed — confirm nlp_spacy is not
# used for anything else before changing this.

# Built once when the cell runs instead of on every call: stopwords.words()
# returns a fresh list each time, which makes every lookup a linear scan.
_STOPWORDS_EN = frozenset(stopwords.words("english"))
_PUNCTUATION = frozenset(string.punctuation)

def limpar_texto(texto):
  """Normalize one document before vectorization.

  Steps: tokenize with NLTK, drop punctuation tokens, lowercase and drop
  English stopwords, then lemmatize the remaining words with spaCy.

  Args:
    texto: Raw text of a single document (str).

  Returns:
    One string containing the lemmas separated by single spaces.
  """
  # split the text into tokens
  tokens = word_tokenize(texto)

  # remove punctuation: drop a token when every character is punctuation.
  # A plain `token in string.punctuation` is a substring test, so
  # multi-character tokens such as "..." or "''" would slip through.
  tokens_sem_pontuacao = [
    p for p in tokens if not all(c in _PUNCTUATION for c in p)
  ]

  # lowercase once, then remove the stopwords
  tokens_minusculos = (p.lower() for p in tokens_sem_pontuacao)
  tokens_sem_stopwords = [p for p in tokens_minusculos if p not in _STOPWORDS_EN]

  conjunto_token = " ".join(tokens_sem_stopwords)

  # lemmatize: spaCy re-tokenizes the joined text and yields one lemma per token
  obj = nlp_spacy(conjunto_token)
  tokens_lema = [p.lemma_ for p in obj]

  return " ".join(tokens_lema)

In [None]:
# Clean every tweet. The input is read from the original DataFrame column
# rather than from `documentos` itself, so re-running this cell does not
# clean text that was already cleaned.
documentos = [limpar_texto(p) for p in dataset["tweet_text"]]

In [None]:
# Extract the text features: build the TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on all documents before the
# train/test split further down, so the vocabulary and IDF weights also see
# the test tweets. Consider fitting on the training portion only.
vetorizador = TfidfVectorizer()
features = vetorizador.fit_transform(documentos)

In [None]:
# Shape of the feature matrix.
features.shape

(47692, 54623)

In [None]:
# Column labels of the feature matrix (a sample of them).
# Note: the slice starts at index 1, so the label at index 0 is not shown.
print(vetorizador.get_feature_names_out()[1:10])

['000' '0000841420' '000s' '0021' '006' '007' '007alisohrab' '007luis'
 '00am']


In [None]:
# Split the data into training and test sets (30% held out for testing).
# random_state pins the shuffle so a re-run reproduces the same split and
# therefore the same metrics; stratify keeps the class proportions equal in
# both parts.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    features, classes, test_size=0.3, random_state=42, stratify=classes
)

In [None]:
# Train the model.
# random_state makes the forest's random sampling repeatable across runs;
# n_jobs=-1 fits the trees on all available cores.
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
random_forest.fit(x_treino, y_treino)

In [None]:
# Predict the classes of the test set.
previsao = random_forest.predict(x_teste)

In [None]:
# Show the predictions.
print(previsao)

['age' 'ethnicity' 'gender' ... 'gender' 'not_cyberbullying'
 'other_cyberbullying']


In [None]:
# Confusion matrix: y_teste is passed as the true labels and previsao as the
# predicted labels.
print(confusion_matrix(y_teste, previsao))

[[2297    3    3   24   30    2]
 [   2 2425    2    2   21    4]
 [   4    4 1954  177  209    7]
 [  36   15   63 1160 1010  104]
 [  18   10  107  551 1647    7]
 [   0    3    5   33   28 2341]]


In [None]:
# Accuracy of the model on the test set.
print(accuracy_score(y_teste, previsao))

0.8263908303047246


In [None]:
# Per-class precision, recall and F1 report for the test set.
print(metrics.classification_report(y_teste, previsao))

                     precision    recall  f1-score   support

                age       0.97      0.97      0.97      2359
          ethnicity       0.99      0.99      0.99      2456
             gender       0.92      0.83      0.87      2355
  not_cyberbullying       0.60      0.49      0.54      2388
other_cyberbullying       0.56      0.70      0.62      2340
           religion       0.95      0.97      0.96      2410

           accuracy                           0.83     14308
          macro avg       0.83      0.83      0.83     14308
       weighted avg       0.83      0.83      0.83     14308



In [None]:

# Example message to classify, written as plain text. Backslashes must not
# appear here: "\ " is not a valid escape sequence and the backslash would end
# up inside the text that is tokenized.
texto_mensagem = "To earn £2.50 of free call credit and details of great deals, reply OK to this text with your full name, house number and postcode"
#texto_mensagem = "Hello, I would like to invite you on a long journey. If you accept, send me your personal data"

# Apply the same cleaning and the already-fitted vectorizer used for training.
# `mensagem` holds only the resulting feature matrix (one row).
mensagem = vetorizador.transform([limpar_texto(texto_mensagem)])
mensagem.shape

(1, 54623)

In [None]:
# Predict the class of the example message.
predicao = random_forest.predict(mensagem)
print(predicao)

['other_cyberbullying']
