In [113]:
# import libraries
import numpy as np
import pandas as pd
import re
from tqdm import tqdm

In [125]:
data = pd.read_json("./result_data.json") # load data

# Remove non-english quotes
We manually checked quotes of the speakers with a lot of (more than 100) quotes  and found non-english speakers.

In [119]:
non_english = ['rodrigo duterte', 'grace poe', 'leni robredo', 'harry roque',
       'salvador panelo', 'oscar albayalde', 'narendra modi',
       'pantaleon alvarez', 'antonio trillanes iv', 'manny pacquiao',
       'leila de lima', 'rahul gandhi', 'risa hontiveros', 'eduardo año',
       'benigno aquino iii', 'salman khan', 'panfilo lacson', 'jejomar binay',
       'alan peter cayetano', 'mar roxas', 'amit shah', 'martin andanar', 'yeng guiao',
     'vice ganda', 'bam aquino', 'francisco duque',
     'delfin lorenzana','franklin drilon', 'sherwin gatchalian',
     'leo austria', 'nancy binay','ralph recto','sara duterte-carpio','arwind santos',
     'francis pangilinan','marc pingris','francis escudero','francis tolentino',
     'la tenorio','aldin ayo','silvestre bello iii','alyssa valdez',
     'gerald anderson', 'pia wurtzbach','sarah geronimo','maine mendoza',
     'sharon cuneta ','marian rivera','maria lourdes sereno','joel villanueva',
     'vitaliano aguirre ii','calvin abueva','dingdong dantes', 'kathryn bernardo', 'hidilyn diaz',
     'maja salvador','vico sotto', 'willie marcial', 'antonio tinio',
     'cynthia villar', 'lito atienza', 'mark villar', 'bea alonzo', 'jeff napa',
     'kim chiu ', 'alden richards', 'angelica panganiban', 'gwendolyn garcia', 'boy abunda ',
     'louie alas ', 'coco martin', 'mark barroca', 'mitt romney', 'scottie thompson',
     'james yap', 'diarmuid martin', 'daniel padilla', 'bo perasol', 'abigail valte',
     'bongbong marcos', 'carlos isagani zarate', 'tito sotto', 'jerwin ancajas', 'oscar moreno',
     'juan miguel zubiri', 'judy taguiwalo', 'florin hilbay', 'joseph estrada',
     'regine velasquez', 'renato reyes', 'lea salonga', 'leonor briones', 'benjamin diokno',
     'beau belga', 'julia montes', 'lolit solis', 'edwin lacierda', 'julia barretto',
     'pido jarencio', 'francis zamora', 'mac belo', 'kai sotto', 'alex gonzaga', 'dennis trillo', 'getulio napeñas',
     'maymay entrata', 'jinggoy estrada', 'barry gutierrez', 'tony gonzaga', 'bernadeth pons',
     'john lloyd cruz', 'carlos conde', 'aby maraño', 'ramon s. ang', 'joey salceda',
     'juan ponce enrile','heart evangelista', 'jake cuenca', 'antoinette jadaone', 'myla pablo']

Some non-english quotes include "gay" with different suffixes. We found these suffixes and filtered them.

In [126]:
# Remove non-english words
print("Before removing words:", len(data))
dropped_idx = []
for idx, quote in enumerate(data.Quote):
    for elem in ["gayi", "gaye", "gaya", "gayu"]:
        if elem in quote:
            dropped_idx.append(idx)
data.drop(dropped_idx, axis=0, inplace=True)
print("After removing words:", len(data))

Before removing words: 209000
After removing words: 202176


In [127]:
# Remove non-english speakers
data = data[~data.Speaker.isin(non_english)]
print("After removing non-english speakers", len(data))

After removing non-english speakers 191642


# Remove similar quotes
The dataset contains similar quotes as shown in Milestone 2. We found quotes with similar prefixes and kept the longest ones from them.

In [128]:
speakers_count = data.Speaker.value_counts()
speakers = speakers_count[speakers_count > 1].keys()  # speakers with more than 1 quote (=> there can be repeated)
clean_df = []  # accumulate dataframes without similar quotes for different speakers
for current_speaker in tqdm(speakers):
    quotes = sorted(data.Quote[data.Speaker == current_speaker], key=lambda x: (x, len(x)))  # sort alphabetically and by length
    # normalize quotes by case, symbols and space quantity; keep the longest
    starts = pd.Series([re.sub("\s\s+" , " ", re.sub(r'[^\w]', " " , text[:30].lower())) for text in quotes]).duplicated(keep="last")
    new_quotes = pd.Series(quotes, name="Quote").drop(np.where(starts.values)[0])  # filtering
    clean_df.append(data[data.Speaker == current_speaker].merge(new_quotes, on="Quote").dropna())  # get all columns for unique quotes
full_clean = pd.concat(clean_df)  # accumulate to 1 dataframe

100%|██████████| 19826/19826 [15:18<00:00, 21.58it/s] 


In [129]:
full_clean.head()  # check resulting data

Unnamed: 0,date_of_birth,nationality,gender,occupation,Speaker,Quote,numOccurrences,quote_year,quote_month
0,1971,[United States of America],[female],[LGBTIQ+ rights activist],sarah kate ellis,This loss is a wake-up call that despite remar...,2,2015,11
1,1971,[United States of America],[female],[LGBTIQ+ rights activist],sarah kate ellis,"She didn't see it, she hadn't heard of it, she...",1,2015,6
2,1971,[United States of America],[female],[LGBTIQ+ rights activist],sarah kate ellis,As a journalist and anchor who reaches million...,1,2015,4
3,1971,[United States of America],[female],[LGBTIQ+ rights activist],sarah kate ellis,By empowering people to talk about their gende...,6,2015,2
4,1971,[United States of America],[female],[LGBTIQ+ rights activist],sarah kate ellis,"By investing in this dangerous programming, TL...",133,2015,1


# Filter non-english quotes
When we had decreased size of a dataset, we used langdetect to find only english quotes automatically.

In [132]:
import langdetect

def is_tweet_english(tweet_text):
    """
    This method returns whether or not
    a Tweet's text is in English.
    """
    try:
        return langdetect.detect(tweet_text) == 'en'
    except:
        return False

In [133]:
full_clean.reset_index(drop=True, inplace=True)  # reset indexes
filtered_df = full_clean[full_clean['Quote'].apply(lambda x: is_tweet_english(x))]  # apply langdetect
print("Size of clean dataset:", filtered_df.shape)  # check shape 
filtered_df.to_json("filtered.json")  # save dataframe

Size of clean dataset: (114746, 9)
