### Libraries 

In [149]:
import pandas as pd
import string 
import re 

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer 

### Import the dataset

In [150]:
file = 'Data/data.csv'
# The file is semicolon-separated; malformed rows are skipped instead of
# aborting the whole load.
try:
    dataframe = pd.read_csv(file, on_bad_lines='skip', sep=";")
except TypeError:
    # Older pandas releases do not know `on_bad_lines`; fall back to the
    # legacy flag, which was removed in pandas 2.0.
    dataframe = pd.read_csv(file, error_bad_lines=False, sep=";")

### Data cleaning 

In [151]:
def data_cleaning(df):
    """Clean the raw tweet dataframe in place and return it.

    The ``ID`` column is dropped, the ``Label`` values ``'none'`` and
    ``'racism'`` are renamed to ``'not racist'`` and ``'racist'``, and every
    entry of ``Tweets`` is stripped of punctuation, lower-cased, and purged
    of URLs and of a fixed set of mis-encoded characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ID``, ``Label`` and ``Tweets`` columns. It is modified
        in place; any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # Delete IDs
    df.drop('ID', axis=1, inplace=True)

    # First encoding. The result is assigned back to the column: an in-place
    # replace on ``df['Label']`` acts on an intermediate object and is not
    # guaranteed to reach ``df``.
    df['Label'] = df['Label'].replace({'none': 'not racist', 'racism': 'racist'})

    # Built once, reused for every tweet.
    punctuation_table = str.maketrans('', '', string.punctuation)
    url_pattern = re.compile(r'http\S+')
    junk_pattern = re.compile("ð|ÿ|‘|œ|¦|€|˜|™|¸|¤|‚|©|¡|…|”|“|‹|š|±|³|iâ|§|„|")

    def clean(tweet):
        # Remove punctuation
        tweet = tweet.translate(punctuation_table)
        # Uppercase -> Lowercase
        tweet = tweet.lower()
        # Delete Url (punctuation is already gone, so a URL is by now a
        # single whitespace-free run starting with "http")
        tweet = url_pattern.sub('', tweet)
        # Delete characters
        return junk_pattern.sub('', tweet)

    # Whole-column assignment instead of per-cell chained assignment, so the
    # result always lands in ``df`` and no index labels are assumed.
    df['Tweets'] = df['Tweets'].map(clean)

    return df

### Tokenization

In [152]:
def tokenization(df):
    """Tokenize every tweet and return all tokens as one flat list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``Tweets`` column holds the tweet strings.

    Returns
    -------
    list
        Tokens of all tweets, concatenated in row order.
    """
    tknz = TweetTokenizer()
    tokens = []

    # Iterate the column itself rather than looking rows up by the labels
    # 0..n-1, so frames with a non-default index work as well.
    for tweet in df['Tweets']:
        tokens.extend(tknz.tokenize(tweet))

    return tokens

### Stemming 

In [153]:
def stemming(tokens):
    """Return the Porter stem of every token.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        A new list holding the stem of each token, in the same order. The
        input is not modified.

    Notes
    -----
    Code that compares these stems with raw tweet text has to stem that
    text the same way, otherwise tokens whose stem differs from their
    surface form will not match.
    """
    stemmer = PorterStemmer()
    # Collect the stems: rebinding the loop variable would discard them.
    return [stemmer.stem(token) for token in tokens]

### Tokens_Frequencies 

In [154]:
def tokens_frequencies(tokens):
    """Build a table of token counts, most frequent first.

    Parameters
    ----------
    tokens : iterable
        Tokens to count.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Tokens`` and ``Frequencies``, sorted by descending
        frequency. The index is not reset after sorting.
    """
    # Tally the occurrences of each token
    counts = FreqDist()
    for tok in tokens:
        counts[tok] += 1

    # One row per distinct token, ordered from most to least frequent
    table = pd.DataFrame(list(counts.items()), columns=["Tokens", "Frequencies"])
    return table.sort_values(by='Frequencies', ascending=False)

### Vectorization

In [155]:
def vectorization(df, nbr_tokens, nbr_tweets, token_frequency):
    """Build a binary bag-of-words matrix for the first tweets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``Tweets`` column holds the tweet strings.
    nbr_tokens : int
        Number of leading rows of ``token_frequency`` used as vocabulary.
    nbr_tweets : int
        Number of leading tweets to vectorize.
    token_frequency : pandas.DataFrame
        Frame with a ``Tokens`` column; its row order defines which tokens
        are taken first.

    Returns
    -------
    pandas.DataFrame
        One row per vectorized tweet and one column per vocabulary token;
        a cell is 1 if the token occurs in the tweet, else 0.
    """
    # Most frequent tokens
    vocabulary = token_frequency['Tokens'].iloc[:nbr_tokens].to_list()

    # The tokenizer does not depend on the tweet, so build it once.
    tknz = TweetTokenizer()

    # Vectorization
    matrix = []
    for tweet in df['Tweets'].iloc[:nbr_tweets]:
        # A set gives constant-time membership tests for each vocabulary token.
        present = set(tknz.tokenize(tweet))
        matrix.append([1 if token in present else 0 for token in vocabulary])

    # Convert the matrix into a dataframe
    return pd.DataFrame(matrix, columns=vocabulary)

### Preprocessing 

In [156]:
def preprocessing(dataset, nbr_tokens, nbr_tweets):
    """Run the whole pipeline and return the bag-of-words matrix.

    The steps are: cleaning, tokenization, stemming, token-frequency
    counting and vectorization. Token frequencies are computed over all
    tweets; only the first ``nbr_tweets`` tweets are vectorized.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw dataset. It is copied first and therefore left untouched.
    nbr_tokens : int
        Number of most frequent tokens used as columns.
    nbr_tweets : int
        Number of leading tweets to vectorize.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``vectorization``.
    """
    # Work on a copy, because data_cleaning modifies the frame it receives.
    df_cleaned = data_cleaning(dataset.copy())

    # tokenization
    tokens = tokenization(df_cleaned)

    # stemming
    tokens_stemmed = stemming(tokens)

    # tokens_frequencies
    tokfreq = tokens_frequencies(tokens_stemmed)

    # vectorization: pass the cleaned frame explicitly instead of relying on
    # the copy having been mutated as a side effect.
    return vectorization(df_cleaned, nbr_tokens, nbr_tweets, tokfreq)

In [157]:
# Not used yet
def stop_words(df):
    """Remove the rows whose token is an English stop word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Tokens`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, without the stop-word rows and with a
        fresh 0..n-1 index.

    Notes
    -----
    Requires the NLTK ``stopwords`` corpus to be installed.
    """
    # Imported here because the name is not imported at the top of the file.
    from nltk.corpus import stopwords

    # Build the lookup once instead of reloading the word list for every row.
    english_stop_words = set(stopwords.words('english'))

    # Drop all matching rows in a single call, then renumber the index
    # (``DataFrame.drop`` itself has no ``ignore_index`` option).
    is_stop_word = df['Tokens'].isin(english_stop_words)
    df.drop(df.index[is_stop_word], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

### Test preprocessing 

In [158]:
# Try the pipeline: vectorize the first 100 tweets over the 10 most frequent tokens.
preprocessing(dataframe, nbr_tokens=10, nbr_tweets=100)

Unnamed: 0,the,to,mkr,a,i,and,is,of,rt,you
0,1,0,0,0,0,0,1,1,0,0
1,0,0,1,0,0,0,0,0,0,0
2,1,1,0,1,0,1,0,0,1,0
3,0,0,0,1,1,1,0,1,1,0
4,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
95,1,0,1,0,0,0,1,0,1,0
96,1,0,0,0,0,0,0,0,0,1
97,1,0,0,0,0,0,1,0,0,0
98,1,1,0,0,0,1,0,0,0,0
