### AIDI 1002 Lab2

In question one of this lab we perform text pre-processing on a sample dataset downloaded from Kaggle. The second question is dedicated to the analysis of time series data.

#### Question 1

In [24]:
# Importing used libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import words
from collections import Counter
# this is for tokenization
# nltk.download('punkt')
# # these packages are for the lemmatizer
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('words')

In [11]:
# Global variables used by the preprocessing functions
# English stop words, stored as a set
STOP_WORDS = set(stopwords.words('english'))
# Lemmatizer instance reused for every lemmatization call
LEMMATIZER = WordNetLemmatizer()
# Words from the NLTK `words` corpus, stored as a set
ENGLISH_WORDS = set(words.words())

In [22]:
# Read the csv file into a dataframe
df = pd.read_csv('sample_kaggle_data.csv')
# Print the number of rows
print(len(df))
# Preview the first 10 rows
df.head(10)

31962


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


The dataset has almost 32k rows. The texts are from Twitter and, as is visible, contain a lot of stop words, punctuation, and non-English words or characters. We assume that those words are not needed in our application. We created a set of functions to remove numbers, remove punctuation, strip leading and trailing spaces, tokenize columns, remove stop words, and lemmatize words according to their part-of-speech tag.

In [47]:
# list of functions that are responsible for preprocessing
def load_csv(path: str) -> 'dataframe':
    '''
    Read a csv file from disk into a pandas dataframe
    input: path -> location of the csv file
    output: pandas dataframe holding the file contents
    '''
    frame = pd.read_csv(path)
    return frame


def drop_columns(df: 'dataframe', cols: list):
    '''
    Drop the given columns from the dataframe.
    The dataframe is modified in place and nothing is returned
    input: df -> dataframe, cols -> list of column names to drop
    '''
    df.drop(labels=cols, axis='columns', inplace=True)

def lower_case(df: 'dataframe', col: str) -> 'dataframe':
    '''
    This function drops the rows whose value in the column is null
    and changes the remaining values of the column to lower case.
    The dataframe passed in is left untouched
    input: df -> dataframe, col -> column
    output: new dataframe (original index labels are kept)
    '''
    # Take an explicit copy of the filtered rows: assigning into the
    # result of a boolean filter directly triggers pandas'
    # SettingWithCopyWarning
    df = df.loc[df[col].notna()].copy()
    df[col] = df[col].str.lower()
    return df
    
# NOTE: This function should only be used if it is applicable to your
# project. It removes numbers from the text.
def remove_numbers(df: 'dataframe', col: str):
    '''
    Delete every run of digits from the values of one column.
    The dataframe is modified in place and nothing is returned
    input: df -> dataframe, col -> targeted column
    '''
    without_digits = df[col].str.replace(r'\d+', '', regex=True)
    df[col] = without_digits

    
# NOTE: This function should only be used if it is applicable to your
# project. It removes punctuation from the text.
def remove_punctuation(df: 'dataframe', col:str) -> 'dataframe':
    '''
    This function removes punctuation from the specified column.
    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is deleted. The column is updated in place
    input: df -> dataframe, col: column
    output: the same dataframe object, returned as the signature declares
    '''
    df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)
    # Return the frame so the declared return type is honoured; callers
    # that ignore the return value still see the in-place change
    return df

    
def remove_white_space(df: 'dataframe', col:str):
    '''
    Strip leading and trailing white space from the values of a column.
    The dataframe is modified in place and nothing is returned
    input: df -> dataframe, col: column
    '''
    trimmed = df[col].str.strip()
    df[col] = trimmed
        

def tokenize(col: str) -> list:
    '''
    Split a piece of text into tokens with nltk's word tokenizer
    input: col -> the text to tokenize (one value of a column)
    output: list of tokens
    '''
    return nltk.word_tokenize(col)


def create_token_col(df: 'dataframe', col: str):
    '''
    Add a new column named '<col>_tokenized' that holds the tokenized
    version of every value in the given column.
    The dataframe is modified in place and nothing is returned
    input: df -> dataframe, col -> column to tokenize
    '''
    target = f'{col}_tokenized'
    df[target] = df[col].apply(tokenize)

    
def remove_non_english(arr: list) -> list:
    '''
    Keep only the words that are found in ENGLISH_WORDS
    input: arr -> list of words
    output: new list with the words that passed the check, in order
    '''
    return [token for token in arr if token in ENGLISH_WORDS]


def apply_non_english(df: 'dataframe', col: str):
    '''
    Run remove_non_english on every value of the given column and
    store the result back in the same column.
    The dataframe is modified in place and nothing is returned
    input: df -> dataframe, col -> column holding lists of words
    '''
    english_only = df[col].apply(remove_non_english)
    df[col] = english_only

    
def remove_stop_words(arr: list) -> list:
    '''
    Filter out the words that are found in STOP_WORDS
    input: arr -> list of words
    output: new list without the stop words, in the original order
    '''
    return [token for token in arr if token not in STOP_WORDS]

def apply_stop_words(df: 'dataframe', col: str):
    '''
    Run remove_stop_words on every value of the given column and
    store the result back in the same column.
    The dataframe is modified in place and nothing is returned
    input: df -> dataframe, col -> column holding lists of words
    '''
    filtered = df[col].apply(remove_stop_words)
    df[col] = filtered

    
def word_type(word: str) -> str:
    '''
    Look up the part of speech of a word in WordNet
    input: word -> word to look up
    output: pos() of the first synset returned for the word
    An IndexError is raised when no synset is returned for the word
    '''
    matches = wordnet.synsets(word)
    first_match = matches[0]
    return first_match.pos()


def lem(col):
    '''
    This function lemmatizes every word of a tokenized text
    input: col -> iterable of words
    output: list of lemmatized words, in the same order
    '''
    lemmatized_words = []
    for word in col:
        try:
            # word_type indexes the first WordNet synset of the word and
            # raises IndexError when the word has no synsets
            pos = word_type(word)
        except IndexError:
            # No part-of-speech tag available: lemmatize without one.
            # Only IndexError is caught so that KeyboardInterrupt and
            # unrelated errors are no longer silently swallowed
            lemmatized = LEMMATIZER.lemmatize(word)
        else:
            lemmatized = LEMMATIZER.lemmatize(word, pos)
        lemmatized_words.append(lemmatized)
    return lemmatized_words


def create_lem_col(df:'dataframe', col: str) -> 'dataframe':
    '''
    This function lemmatizes the column and stores the result in a new
    column named '<col>_lemmatized'. The dataframe is updated in place
    input: df -> dataframe, col -> column holding lists of words
    output: the same dataframe object, returned as the signature declares
    '''
    df[f'{col}_lemmatized'] = df[col].apply(lem)
    # Return the frame so the declared return type is honoured; callers
    # that ignore the return value still see the in-place change
    return df

In [74]:
def pipeline_one(path: str = 'sample_kaggle_data.csv'):
    '''This function applies the higher level pipeline
    functions in order: load, drop the 'id' and 'label' columns,
    lower-case, remove punctuation, remove numbers, strip white space,
    tokenize, remove non-english words, remove stop words, lemmatize
    input: path -> csv file to process (defaults to the sample data,
    so calling pipeline_one() with no argument behaves as before)
    output: dataframe with the cleaned 'tweet' column plus the
    'tweet_tokenized' and 'tweet_tokenized_lemmatized' columns
    '''
    print('loading dataframe...')
    df1 = load_csv(path)
    print('removing redundant columns...')
    drop_columns(df1, ['id','label'])
    print('changing words to lower case...')
    # lower_case returns a dataframe instead of editing in place,
    # so its result has to be re-assigned
    df1 = lower_case(df1, 'tweet')
    print('removing punctuations...')
    remove_punctuation(df1, 'tweet')
    print('removing numbers...')
    remove_numbers(df1, 'tweet')
    print('removing white spaces...')
    remove_white_space(df1, 'tweet')
    print('tokenizing targetted column...')
    create_token_col(df1, 'tweet')
    print('removing non-english words...')
    apply_non_english(df1,'tweet_tokenized')
    print('remove stopwords...')
    apply_stop_words(df1, 'tweet_tokenized')
    print('lemmatize...')
    create_lem_col(df1, 'tweet_tokenized')
    return df1

In [77]:
# Run the preprocessing pipeline and keep the resulting dataframe
df1 = pipeline_one()

loading dataframe...
removing redundant columns...
changing words to lower case...
removing punctuations...
removing numbers...
removing white spaces...
tokenizing targetted column...
removing non-english words...
remove stopwords...
lemmatize...


In [79]:
# Preview the last 10 rows of the processed dataframe
df1.tail(10)

Unnamed: 0,tweet,tweet_tokenized,tweet_tokenized_lemmatized
31952,user you went too far with user,"[user, went, far, user]","[user, go, far, user]"
31953,good morning instagram shower water berlin ber...,"[good, morning, shower, water, berlin, girl]","[good, morning, shower, water, berlin, girl]"
31954,holiday bull up you will dominate your bull ...,"[holiday, bull, dominate, bull, direct, whatev...","[holiday, bull, dominate, bull, direct, whatev..."
31955,less than weeks ððð¼ð¹ððµ user ibizabringito...,"[less, user]","[le, user]"
31956,off fishing tomorrow user carnt wait first tim...,"[fishing, tomorrow, user, wait, first, time]","[fishing, tomorrow, user, wait, first, time]"
31957,ate user isz that youuuðððððððððâï,"[ate, user]","[ate, user]"
31958,to see nina turner on the airwaves trying to w...,"[see, turner, trying, wrap, mantle, genuine, h...","[see, turner, try, wrap, mantle, genuine, hero..."
31959,listening to sad songs on a monday morning otw...,"[listening, sad, morning, work, sad]","[listening, sad, morning, work, sad]"
31960,user sikh temple vandalised in in calgary wso ...,"[user, temple, act]","[user, temple, act]"
31961,thank you user for you follow,"[thank, user, follow]","[thank, user, follow]"
