###### Cory Melendez
###### Natural Language Processing Project
###### https://github.com/cmelende/NLPProject.git
###### 12/11/20


###### 1. Import libraries, load dataset, print shape of data, data description

In [1]:
import pandas as pd
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup

KeyboardInterrupt: 

In [None]:
# Path of the tweets CSV, relative to the working directory.
data_source = './data/Tweets.csv'
# Download the NLTK stop-word corpus that stopwords.words() reads below.
nltk.download('stopwords')
english_stopwords = stopwords.words("english")
df = pd.read_csv(data_source)
print("Shape of Data is: ", df.shape)
# "\n" (a real newline, not the two characters "/n") so the summary
# table starts on its own line.
print("Description of data is: \n", df.describe())


In [None]:
# Column names of the loaded CSV, bound to variables so the rest of the
# notebook never repeats the raw strings.

# Identification and sentiment labels
tweet_col = 'tweet_id'
airline_sentiment = 'airline_sentiment'
airline_sentiment_confidence = 'airline_sentiment_confidence'
airline_sentiment_gold = 'airline_sentiment_gold'

# Reason given for negative sentiment
negative_reason = 'negativereason'
negative_reason_confidence = 'negativereason_confidence'
negative_reason_gold = 'negativereason_gold'

# Tweet content and metadata
airline = 'airline'
name = 'name'
retweet_count = 'retweet_count'
text = 'text'
tweet_coord = 'tweet_coord'
tweet_created = 'tweet_created'
tweet_location = 'tweet_location'
user_timezone = 'user_timezone'

# Every column name, in the order used when selecting columns.
all_cols = [
    tweet_col,
    airline_sentiment,
    airline_sentiment_confidence,
    negative_reason,
    negative_reason_confidence,
    airline,
    airline_sentiment_gold,
    name,
    negative_reason_gold,
    retweet_count,
    text,
    tweet_coord,
    tweet_created,
    tweet_location,
    user_timezone,
]

##### 2. Understand the data columns

###### a. Drop all other columns except 'text' and 'airline_sentiment'

In [None]:
def remove_columns(df, keep_columns, all_columns):
    """Return a new DataFrame holding only the requested columns of ``df``.

    Args:
        df: Source DataFrame; it is not modified.
        keep_columns: Collection of column names to retain.
        all_columns: Iterable of candidate column names. It fixes the
            column order of the result; names in ``keep_columns`` that do
            not occur in it are ignored.

    Returns:
        An independent copy of ``df`` restricted to the selected columns.
        The row index of ``df`` is preserved, also when nothing is selected.

    Raises:
        KeyError: If a selected column name is not a column of ``df``.
    """
    # dict.fromkeys drops repeated names while keeping first-seen order, so
    # a name listed twice in all_columns still yields a single column.
    selected = list(
        dict.fromkeys(col for col in all_columns if col in keep_columns)
    )
    # One selection instead of inserting columns one at a time; .copy()
    # keeps later writes to the result from touching ``df``.
    return df[selected].copy()

###### b. Check the shape of the data

In [None]:
# Reduce the frame to the tweet text and its sentiment label.
trimmed_df = remove_columns(
    df,
    keep_columns=[text, airline_sentiment],
    all_columns=all_cols,
)
print("shape: ", trimmed_df.shape)

###### c. Print the first 5 rows

In [None]:
# head() returns the first five rows when called without an argument.
print("first 5 rows")
print(trimmed_df.head())

##### 3. Text Pre-processing: Data Preparation

###### a. Html tag removal
###### c. Remove the numbers
###### d. remove special characters and punctuation
###### e. conversion to lowercase
###### g. join the words in the list to convert back to text string in the dataframe (So that each row contains the data in text format)

In [None]:
def strip_html(text):
    """Return the text content of ``text`` with HTML tags removed.

    The input is parsed by BeautifulSoup with the "html.parser" parser and
    the parsed document's ``get_text()`` result is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_special_characters_numbers(text):
    """Remove every character that is not an ASCII letter or whitespace.

    Punctuation, symbols and digits are all deleted; letters and
    whitespace (spaces, tabs, newlines) are kept unchanged.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all disallowed characters removed.
    """
    # The upper-case range must be "A-Z": writing "A-z" would also allow
    # the ASCII characters between "Z" and "a" ([ \ ] ^ _ `) to survive.
    # Digits are deliberately absent from the allowed set so that numbers
    # are stripped together with the special characters.
    pattern = r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)

def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    return text.lower()

def clean_text(text):
    """Run ``text`` through the cleaning steps and return the result.

    The steps are applied in this order: ``strip_html``,
    ``remove_special_characters_numbers``, ``to_lowercase``.
    """
    cleaning_steps = (
        strip_html,
        remove_special_characters_numbers,
        to_lowercase,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

# Work on a copy so trimmed_df keeps the raw tweets, then clean the text
# column value by value.
cleaned_df = trimmed_df.copy()
cleaned_df[text] = cleaned_df[text].map(clean_text)

###### b. Tokenization

In [None]:
def tokenize(text):
    """Return the tokens that NLTK's ToktokTokenizer produces for ``text``."""
    return ToktokTokenizer().tokenize(text)

def remove_stopwords(text):
    """Return ``text`` without the tokens listed in ``english_stopwords``.

    The text is split with ``tokenize``; tokens that match an entry of
    ``english_stopwords`` exactly are dropped, and the remaining tokens are
    joined back into one string by ``reassemble_token_array``.
    """
    kept_tokens = [
        token for token in tokenize(text)
        if token not in english_stopwords
    ]
    return reassemble_token_array(kept_tokens)

def reassemble_token_array(token_array):
    """Join the tokens into one string, separated by single spaces."""
    return ' '.join(token_array)

# Strip stop words from every tweet in the text column.
cleaned_df[text] = cleaned_df[text].map(remove_stopwords)

###### f. lemmatization or stemming


In [None]:
# Load the en_core_web_sm pipeline once at module level; lemmatize_text
# below calls the resulting `nlp` object for every tweet.
import en_core_web_sm
nlp = en_core_web_sm.load()
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is processed by the module-level ``nlp`` pipeline and the
    resulting lemmas are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        # When the lemma is the '-PRON-' placeholder, fall back to the
        # token's original text.
        lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
    return ' '.join(lemmas)

# Lemmatize every tweet in the text column.
cleaned_df[text] = cleaned_df[text].map(lemmatize_text)

###### h. print first 5 rows of data after pre-processing

In [None]:
# Final expression of the cell: the first five rows of the cleaned data
# (head() defaults to five rows).
cleaned_df.head()