# Importação das bibliotecas utilizadas

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import io

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# Pré-processamento dos dados

In [None]:
# To read the file, it must be present in your Google Drive.
# The file can be downloaded from: https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification
# NOTE(review): the path below is hard-coded to one specific Drive folder; adjust it to wherever the CSV was uploaded.
df = pd.read_csv("/content/drive/MyDrive/a/cyberbullying_tweets.csv")

In [None]:
# Show the raw text of the tweet stored under row label 34 (single label-based lookup).
df.loc[34, 'tweet_text']

"I think @bxokrissy third period teacher doesn't like me he always tells me to go to class when I walk u to class"

In [None]:
# Bag-of-words vectorizer; min_df=10 discards terms that appear in fewer than 10 documents.
vetorizador = CountVectorizer(min_df=10)

In [None]:
# Learn the vocabulary from the tweet texts and build the document-term count matrix in one step.
matriz_palavras = vetorizador.fit_transform(df['tweet_text'])

In [None]:
# Expand the count matrix into a dense DataFrame with one column per vocabulary term.
# NOTE(review): .toarray() materialises every cell of the matrix; confirm this fits in memory for the full corpus.
df_palavras = pd.DataFrame(matriz_palavras.toarray(), columns=vetorizador.get_feature_names_out())

In [None]:
# `final_df` is bound to the same object as `df_palavras` (no copy is made), so the
# label column added below is also visible through `df_palavras`.
final_df = df_palavras

# Attach the original multi-class label, then collapse it to a binary target:
# 1 for every class other than 'not_cyberbullying', 0 for 'not_cyberbullying'.
final_df['cyberbullying_classification'] = df['cyberbullying_type']
final_df['cyberbullying_classification'] = (
    final_df['cyberbullying_classification'].ne('not_cyberbullying').astype(int)
)

# Turn every count (and the 0/1 label) into a boolean "greater than zero" flag.
# This comparison creates a new frame, so from here on `final_df` no longer aliases `df_palavras`.
final_df = final_df.gt(0)

# Criação de um novo dataset e treinamento do algoritmo

In [None]:
# Separate the target from the features without mutating `final_df`.
# The previous in-place drop removed the label column from `final_df` itself, so running
# this cell a second time raised a KeyError; building a new frame keeps the cell re-runnable.
y = final_df['cyberbullying_classification']
df_both = final_df.drop(columns=['cyberbullying_classification'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing.
# A fixed random_state makes the split (and every metric computed from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df_both, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyper-parameters.
clf = MultinomialNB()
# Fit on the training split; the fitted estimator is the cell's displayed value.
clf.fit(X_train, y_train)

# Predição com os dados de treino e teste

In [None]:
# Predicted labels for the held-out test split.
test_pred = clf.predict(X_test)
print(test_pred)

[ True  True  True ...  True  True  True]


In [None]:
# Predicted labels for the rows the model was trained on (for comparison with the test predictions).
train_pred = clf.predict(X_train)
print(train_pred)

[ True  True  True ... False  True  True]


# Métricas e comparações dos resultados obtidos

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label, expressed as a percentage.
accuracy = 100 * accuracy_score(y_test, test_pred)
print('Acurácia do modelo: {0:0.2f}%'.format(accuracy))

Acurácia do modelo: 83.32%


In [None]:
# Side-by-side table of true labels and predictions for the test split.
results = {
    'Test Real Value': y_test,
    'Test Prediction': test_pred,
}
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Test Real Value,Test Prediction
12961,True,True
8332,True,True
36154,True,True
42013,True,True
8588,True,True
...,...,...
27303,True,True
25460,True,False
47011,True,True
42346,True,True


In [None]:
# Side-by-side table of true labels and predictions for the training split.
# Note: this rebinds `results` and `df_results` from the test-split cell.
results = {
    'Train Real Value': y_train,
    'Train Prediction': train_pred,
}
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Train Real Value,Train Prediction
36264,True,True
14935,True,True
46949,True,True
39648,True,True
35893,True,True
...,...,...
41045,True,True
9330,True,False
2922,False,False
9344,True,True


# Teste com novos dados reais

In [None]:
# Hand-picked messages used to try the trained classifier on unseen text.
commentaries = [
    "RT @femfreq: Let me spell it out. It is deeply misogynist to propagate wild conspiracy theories suggesting women in gaming fake death or ra…",
    "Quaazzzyyy some females won’t fuck wimme cause of who i call my bruddas I switch for no bitch #4L on my way too more bread",
    "mkr Promo girls are going to BRING IT in the next round. Yeah! Bring it! Bring your store bought capsicums!",
    "You are so idiots that you the Muslim fascists don't really know that Ayasofya is Saint Sophia.Αγία Σοφία.Greek and not Turkish.Christian and not MuslimAt least you can call otherwise and not Ayasofya.Every body is laughing with you.The end is approaching! The lira is collapsing",
    "I think @bxokrissy third period teacher hates me he always tells me to go to class when I walk u to class",
    "&amp;&amp; he talking loud Af!",
    "Just tried a new restaurant, and the food was a major disappointment. Overpriced and tasteless. Save your money and taste buds, folks. #FoodFail",
    "Can't believe the audacity of some influencers. All they do is promote products and pretend their lives are perfect. Wake up, people! Social media isn't reality. #FakeInfluencers",
    "Another day, another internet troll spreading hate. Seriously, when will these keyboard warriors find a better hobby? Let's spread kindness, not negativity. #PositiveVibesOnly",
    "I'm sorry, but if you can't differentiate 'your' from 'you're,' maybe you shouldn't be posting. Just saying. #GrammarMatters",
]

# One row per message, held in a single "Message" column.
data = pd.DataFrame({"Message": commentaries})

In [None]:
# Keep only runs of ASCII letters from each message, re-joined with single spaces.
# NOTE(review): no later visible cell reads `processed_text`; the vectorizer below is fed the raw `Message` column.
data['processed_text'] = [
    ' '.join(re.findall(r'\b[A-Za-z]+\b', texto))
    for texto in data['Message']
]

In [None]:
# Fresh vectorizer with default settings for the new messages.
# NOTE(review): this rebinds `vetorizador`, so the vectorizer created earlier with min_df=10 is no longer reachable by this name.
vetorizador = CountVectorizer()

In [None]:
# Fit the new vectorizer on the raw `Message` column (not on `processed_text`) and count the terms of each message.
matriz_palavras_data = vetorizador.fit_transform(data['Message'])

In [None]:
# Dense DataFrame of the new-message counts, one column per term found in those messages.
df_data_palavras = pd.DataFrame(matriz_palavras_data.toarray(), columns=vetorizador.get_feature_names_out())

In [None]:
# Empty template (zero rows) that carries exactly the columns of `df_palavras`, i.e. the
# training vocabulary plus the label column that was added to that frame earlier.
# `.iloc[0:0]` selects no rows directly instead of slicing past the end, and `.copy()` makes
# the template an independent object, so assigning columns to it later does not raise
# SettingWithCopyWarning. The unused length variable was removed.
new_df_data_palavras = df_palavras.iloc[0:0].copy()

In [None]:
# Align the new-message counts to the template's columns in a single step:
#   * columns present in both frames keep their counts,
#   * template columns missing from the new messages become NaN (zero-filled in the next cell),
#   * words that only occur in the new messages are dropped.
# This replaces a per-column assignment loop over a slice of another frame, which raised
# SettingWithCopyWarning and left the frame with zero rows when no word overlapped.
columns = new_df_data_palavras.columns
new_df_data_palavras = df_data_palavras.reindex(columns=columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_data_palavras[column] = df_data_palavras[column]


In [None]:
# Finish the feature matrix for the new messages without mutating the frame in place:
#   1. missing words count as 0,
#   2. counts become boolean "greater than zero" flags (same encoding as the training features),
#   3. the label column is removed so only feature columns are passed to the model.
# errors='ignore' keeps the cell re-runnable: on a second run the label column is already gone.
new_df_data_palavras = (
    new_df_data_palavras
    .fillna(0)
    .gt(0)
    .drop(columns='cyberbullying_classification', errors='ignore')
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_data_palavras.fillna(0, inplace=True)


In [None]:
# Classify the new messages with the trained model; one boolean prediction per message.
data_test_pred = clf.predict(new_df_data_palavras)
print(data_test_pred)

[False  True False  True  True  True False  True  True  True]
