In [None]:
# Mount Google Drive at /content/project; the dataset paths read further down
# all start with this mount point.
from google.colab import drive
drive.mount('/content/project')

In [None]:
import pandas as pd
import numpy as np
import re
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# Gensim
import gensim
from gensim.utils import simple_preprocess
# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
from wordcloud import WordCloud
#Spacy parser
import spacy
nlp = spacy.load("en_core_web_sm")

import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

  import pandas.util.testing as tm


## Load data

In [None]:
# Folder on the mounted Drive that holds the three source corpora. Kept in a
# single constant so the location only has to be edited in one place.
DATASETS_DIR = "/content/project/My Drive/Doutorado/2020.1/Mineração de dados/ProjetoMD/Datasets"

hs = pd.read_csv(f"{DATASETS_DIR}/model_data.csv")
mlma = pd.read_csv(f"{DATASETS_DIR}/mlma_dataset.csv")
# Unlike the other two files, hasoc2019_data.csv is read as tab-separated.
hasoc = pd.read_csv(f"{DATASETS_DIR}/hasoc2019_data.csv", delimiter="\t")

# Stack the three corpora row-wise under a fresh 0..n-1 index.
df = pd.concat([hs, mlma, hasoc], axis=0, ignore_index=True)
df.info()

## Drop Duplicates

In [None]:
# Keep only the first row for every distinct tweet text.
df = df.drop_duplicates(subset=['text'], keep='first')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10321 entries, 0 to 10500
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10321 non-null  object
 1   text        10321 non-null  object
 2   label       10321 non-null  object
dtypes: object(3)
memory usage: 322.5+ KB


## Descriptive statistics

In [None]:
# Number of whitespace-separated words in each tweet.
count = df['text'].str.split().str.len()
# Relabel the index as "<row label> words:" strings; the counts themselves are unchanged.
count.index = count.index.astype(str) + ' words:'
print("Total number of words:", count.sum(), "words")


NameError: ignored

In [None]:
# Average of the per-tweet word counts computed in `count`, rounded to two decimals.
print("Mean number of words per tweet:", round(count.mean(), 2),'words')

Mean number of words per tweet: 12.31 words


In [None]:
# Store each tweet's character count in a new column, then report the corpus total.
df['tweet_length'] = df['text'].str.len()
total_characters = df['tweet_length'].sum()
print("Total length of the dataset is:", total_characters, "characters")

Total length of the dataset is: 781347 characters


In [None]:
# Average tweet length in characters, rounded to a whole number (still printed as a float).
print("Mean length of a tweet is:", round(df.tweet_length.mean(),0),'characters')

Mean length of a tweet is: 76.0 characters


# Feature engineering (done before some important information is removed from the text)

In [None]:
def has_element(tweet, pattern1, pattern2):
    """Report whether ``tweet`` contains a non-empty match of either pattern.

    The patterns are searched directly instead of re-using the matched text as
    a new regular expression, so matches that contain regex metacharacters
    (``?``, ``)``, ``+`` ... as commonly found in URLs) are detected reliably,
    and a tweet that consists solely of the match is still reported.

    Args:
        tweet: Text to inspect.
        pattern1: First regular expression to look for.
        pattern2: Second regular expression, tried when the first finds nothing.

    Returns:
        True if at least one non-empty match of ``pattern1`` or ``pattern2``
        occurs in ``tweet``, otherwise False.
    """
    for pattern in (pattern1, pattern2):
        # Empty matches carry no element, so only non-empty ones count.
        if any(match.group(0) for match in re.finditer(pattern, tweet)):
            return True
    return False


In [None]:
# Flag tweets that mention a user, written either as "@ name" or "@name".
# Raw strings keep `\w` from being parsed as an (invalid) string escape.
df["has_user"] = np.vectorize(has_element)(df['text'], r"@ [\w]*", r"@[\w]*")

In [None]:
# Flag tweets that carry a hashtag, written either as "# tag" or "#tag".
# Raw strings keep `\w` from being parsed as an (invalid) string escape.
df["has_hashtag"] = np.vectorize(has_element)(df['text'], r"# [\w]*", r"#[\w]*")

In [None]:
# Flag tweets that contain a link; the same "http…" pattern is passed for both pattern slots.
df["has_url"] = np.vectorize(has_element)(df['text'], r"http\S+", r"http\S+")

## Remove @users
Twitter enables including usernames within tweets through the symbol “@.” These do not possess any value for our analysis; hence they are removed from the dataset using a function.

In [None]:
def remove_users(tweet, pattern1, pattern2):
    """Delete every match of ``pattern1`` and then of ``pattern2`` from ``tweet``.

    The patterns are applied directly with ``re.sub``. Re-using each matched
    string as a new regular expression would leave residue when one match is a
    prefix of another (``"@ab @abc"`` -> ``" c"``) and would misbehave on
    matches containing regex metacharacters.

    Args:
        tweet: Text to clean.
        pattern1: Regular expression removed first.
        pattern2: Regular expression removed from the result of the first pass.

    Returns:
        The text with all matches of both patterns removed.
    """
    for pattern in (pattern1, pattern2):
        tweet = re.sub(pattern, '', tweet)
    return tweet
# Strip user mentions ("@ name" and "@name") from the raw text into a working column.
# Raw strings keep `\w` from being parsed as an (invalid) string escape.
df['tidy_tweet'] = np.vectorize(remove_users)(df['text'], r"@ [\w]*", r"@[\w]*")
df.head()

Unnamed: 0.1,Unnamed: 0,text,label,tweet_length,has_user,has_hashtag,has_url,tidy_tweet
0,0,These girls are the equivalent of the irritati...,racism,99,False,True,False,These girls are the equivalent of the irritati...
1,1,Who is writing the bimbolines? #mkr,sexism,35,False,True,False,Who is writing the bimbolines? #mkr
2,2,"Colin will save them. They're pretty blondes, ...",sexism,75,False,True,False,"Colin will save them. They're pretty blondes, ..."
3,3,Which will end first: #mkr or Tony Abbott as PM?,none,48,False,True,False,Which will end first: #mkr or Tony Abbott as PM?
4,4,RT @TheAngelaOfOz: That's bullshit Colin and y...,none,62,True,True,False,RT : That's bullshit Colin and you know it. #mkr


## lowercase normalization

In [None]:
# Lower-case the working text so later replacements and stop-word checks see one casing.
df['tidy_tweet'] = df['tidy_tweet'].str.lower()

## Remove hashtags (#)
As with usernames, hashtags are considered to add little value for topic modeling, so they are removed as well.

**TODO: it would be good to check which hashtags are the most frequent.**

In [None]:
# Re-use remove_users with hashtag patterns ("# tag" and "#tag") on the working text.
# Raw strings keep `\w` from being parsed as an (invalid) string escape.
df['tidy_tweet'] = np.vectorize(remove_users)(df['tidy_tweet'], r"# [\w]*", r"#[\w]*")
df.head(10)

## Remove links


In [None]:
def remove_links(tweet):
    """Return ``tweet`` with every ``http``-prefixed run of non-whitespace characters deleted."""
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", tweet)
# Apply remove_links element-wise to the working text column.
df['tidy_tweet'] = np.vectorize(remove_links)(df['tidy_tweet'])

## remove punctuations, numbers, special characters and short words

In [None]:
# REMOVE Punctuations, Numbers, and Special Characters
# regex=True is stated explicitly: on pandas >= 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the text untouched.
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

# REMOVE SHORT WORDS (only words longer than three characters are kept)
df['tidy_tweet'] = df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

## words replaces

In [None]:
# Collapse "retarded" into "retard" so both forms count as the same token.
# regex=False pins the literal-substring behaviour regardless of the pandas version.
df['tidy_tweet'] = df['tidy_tweet'].str.replace("retarded", "retard", regex=False)

## Lemmatization
An essential pre-processing step is tokenization: the text is split into tokens, with every word and punctuation mark stored as a separate token. We perform this step with the spaCy parser and keep each token's lemma together with its part-of-speech tag.

Punctuation, proper nouns, pronouns and determiners are also removed at this point.


In [None]:
def tokenize_spacy(tweet):
    """Lazily turn each text into a list of ``"lemma|POS"`` strings.

    Uses the module-level spaCy pipeline ``nlp``. Tokens tagged as punctuation,
    proper noun, pronoun or determiner are dropped.

    Args:
        tweet: Iterable of text strings (e.g. a DataFrame column).

    Yields:
        One list per input text, holding ``lemma|POS`` for every kept token.
    """
    excluded_pos = {"PUNCT", "PROPN", "PRON", "DET"}
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # paying the per-call overhead of nlp(text) for every tweet.
    for doc in nlp.pipe(tweet):
        yield [f"{t.lemma_}|{t.pos_}" for t in doc if t.pos_ not in excluded_pos]

# Materialise the generator so every row gets its own list of "lemma|POS" tokens.
df['tidy_tweet_tokens'] = list(tokenize_spacy(df['tidy_tweet']))
df.head()


## Remove stopwords

Next, we remove stopwords that have no analytic value, usually articles, prepositions, or pronouns, for instance, ‘a,’ ‘and,’ ‘the,’ etc. The default list can be adjusted and extended as desired. We added some new words to the predefined list of Natural Language Toolkit (NLTK), which contains 179 words.

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus used below.
nltk.download('stopwords')

# Prepare Stop Words: NLTK's English list followed by a few extra tokens.
stop_words = [
    *stopwords.words('english'),
    'https', 'twitter', 'rt', 'pic', 'twitt', 'amp', 'pron',
]

def remove_stopwords(tweets, stopword_list=None):
    """Drop stop-word tokens from every tokenised tweet.

    Each token is expected in ``"lemma|POS"`` form; only the part before the
    first ``|`` is compared against the stop-word list.

    Args:
        tweets: Iterable of token lists.
        stopword_list: Optional iterable of words to remove. When omitted, the
            module-level ``stop_words`` list is used.

    Returns:
        A list with one filtered token list per input tweet.
    """
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    blocked = set(stop_words if stopword_list is None else stopword_list)
    return [[word for word in tweet if word.split("|")[0] not in blocked] for tweet in tweets]

# Filter the "lemma|POS" tokens of every tweet against stop_words.
df['tokens_no_stop'] = remove_stopwords(df['tidy_tweet_tokens'])
df.head(10)

## REMOVE TWEETS LESS THAN 3 TOKENS
For topic modeling and sentiment analysis, documents with fewer than three tokens do not carry enough information, so they are removed.

In [None]:
# Keep only tweets with at least three remaining tokens and renumber the rows.
# A boolean mask replaces the temporary 'length' column and the stray
# `df.shape` statement, which displayed nothing because it was not the last
# expression of the cell.
token_counts = df['tokens_no_stop'].apply(len)
df = df[token_counts >= 3].reset_index(drop=True)
df.info()

## Word Clouds
### All tweets

In [None]:
# Flatten the per-tweet token lists into one list covering every tweet.
flat_list = []
for tokens in df['tokens_no_stop'].values:
    flat_list.extend(tokens)

# Size each token in the cloud by how often it occurs.
token_frequencies = Counter(flat_list)
wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(token_frequencies)

plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
df.info()

### Offensive


In [None]:
# Rows whose label is anything other than 'none', 'normal' or 'NOT'.
is_neutral = df.label.isin(['none','normal','NOT'])
offensive_df = df[~is_neutral]

# Flatten the token lists of the selected rows.
flat_list = []
for tokens in offensive_df['tokens_no_stop'].values:
    flat_list.extend(tokens)

# Size each token in the cloud by how often it occurs.
token_frequencies = Counter(flat_list)
wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(token_frequencies)

plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
offensive_df.info()

### Not Offensive


In [None]:
# Rows labelled 'none', 'normal' or 'NOT'.
# NOTE(review): the name `offensive_df` is kept from the original cell even
# though it holds the non-offensive rows here; consider renaming it.
is_neutral = df.label.isin(['none','normal','NOT'])
offensive_df = df[is_neutral]

# Flatten the token lists of the selected rows.
flat_list = []
for tokens in offensive_df['tokens_no_stop'].values:
    flat_list.extend(tokens)

# Size each token in the cloud by how often it occurs.
token_frequencies = Counter(flat_list)
wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(token_frequencies)

plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
offensive_df.info()

In [None]:
# Persist the preprocessed frame (including the token-list columns) as a pickle on the mounted Drive.
df.to_pickle("/content/project/My Drive/Doutorado/2020.1/Mineração de dados/ProjetoMD/preprocessed_pos")

In [None]:
# Gather every token of every tweet, then show the five most frequent ones.
flat_list = []
for tokens in df['tokens_no_stop'].values:
    flat_list.extend(tokens)
c = Counter(flat_list)
c.most_common(5)

In [None]:
# Show the original text next to its final token list for a visual sanity check.
df[['text','tokens_no_stop']]

In [1]:
# Print NLTK's English stop-word list for reference.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea