In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import bz2
import json
from tld import get_tld
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
import math
from collections import Counter
from operator import itemgetter
import re
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lisalaurent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lisalaurent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from Data_clean_functions import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lisalaurent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lisalaurent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### CREATE PATHS

In [3]:
DATA_PATH = './Data/'
FILE2016 = DATA_PATH + 'quotes-2016.json.bz2'
PATH_OUT = DATA_PATH + 'clean-quotes-2016.json.bz2'
#FILE2020 = DATA_PATH + 'quotes-2020.json.bz2'
#df_base = pd.read_json(FILE2016, lines=True, compression='bz2', nrows=10000)

In [4]:
df_base = pd.read_json(FILE2016, lines=True, compression='bz2', nrows=100)
df_base

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2016-12-26-000040,[ ] and Chris [ Jones ] were in there a lot an...,Andy Reid,"[Q2622812, Q27830815, Q470738, Q4761219]",2016-12-26 20:05:00,1,"[[Andy Reid, 0.9432], [None, 0.0541], [Trevor ...",[http://www.kcchiefs.com/news/article-2/How-a-...,E
1,2016-07-31-000006,[ And ] I don't know if we have enough time to...,Mike Howe,[Q6847325],2016-07-31 08:22:12,2,"[[Mike Howe, 0.7118], [None, 0.2882]]",[http://www.peninsuladailynews.com/apps/pbcs.d...,E
2,2016-09-06-000292,... I feel like I was champion long before I l...,,[],2016-09-06 20:54:45,2,"[[None, 0.6877], [John Waters, 0.3123]]",[http://onlineathens.com/breaking-news/2016-09...,E
3,2016-07-11-000226,[ I ] mmigration has been and continues to be ...,Hillary Clinton,[Q6294],2016-07-11 17:26:06,1,"[[Hillary Clinton, 0.9025], [None, 0.0975]]",[http://www.breitbart.com/tech/2016/07/11/hill...,E
4,2016-05-26-000371,[ It is ] the process of understanding what ki...,Bruce Maxwell,[Q26129591],2016-05-26 15:21:37,1,"[[Bruce Maxwell, 0.8178], [None, 0.1822]]",[http://www.scout.com/mlb/athletics/story/1673...,E
...,...,...,...,...,...,...,...,...,...
95,2016-08-23-002448,a revolver with one chamber empty and a black ...,William Quinn,[Q27789027],2016-08-23 11:32:37,1,"[[William Quinn, 0.7352], [None, 0.2648]]",[http://m.spokesman.com/stories/2016/aug/24/10...,E
96,2016-12-24-001062,A Savior Born for Us.,,[],2016-12-24 00:00:00,1,"[[None, 0.9235], [John Roberts, 0.0765]]",[http://www.journal-news.net/life/faith/2016/1...,E
97,2016-05-19-002696,a sheer stroke of luck,Hayley Squires,[Q27050132],2016-05-19 02:49:27,4,"[[Hayley Squires, 0.662], [None, 0.338]]",[http://www.themalaymailonline.com/showbiz/art...,E
98,2016-05-26-002760,a significant gap between the rhetoric and rea...,,[],2016-05-26 15:27:42,1,"[[None, 0.5965], [David J, 0.32], [David Camer...",[https://www.rt.com/uk/344486-eu-army-brexit-m...,E


## Create a dictionnary of categories and associated synonyms

In [None]:
import nltk
nltk.download()
from nltk.corpus import wordnet

synonyms = []
antonyms = []

for syn in wordnet.synsets("business"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))

# Creating a list of synonyms for the dicitionary

matchers = {"art": ["art", "paint", "draw"], "business": ["business", "finance", "economy"], "sport": ["sport", "football"]}

for category in matchers:
    synonyms = []
    for i in range(len(matchers[category])):
        word = matchers[category][i]
        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                synonyms.append(l.name())
        matchers[category] = synonyms



In [5]:
stemmer = PorterStemmer()
matchers = {"art": ["art", "paint", "draw"], "business": ["business", "finance", "economy"], "sport": ["sport", "football"]}
generalizeDictionary(matchers)

### PROCESSING DATA

In [6]:
def process_chunk_complete(chunk, threshold_proba, matchers):
    print(f'Processing chunk with {len(chunk)} rows')
    #DATA CLEANING
    #Remove None speakers
    chunk = Remove_none_speakers(chunk)
    
    #Remove none unique ids and keep the first one
    chunk = Remove_none_unique_ids(chunk)
        
    #Remove nan or empty quotes 
    chunk = Remove_empty_quotes(chunk)

    #Remove speakers for which probability is lower than a threshold
    chunk = Remove_low_proba(chunk, threshold_proba)
    
    #URLS DATA EXTRACTION
    chunk = Chunk_url_extract(chunk, matchers)
    
    tot_length = len(chunk)
    return chunk, tot_length

In [45]:
with pd.read_json(FILE2016, lines=True, compression='bz2', chunksize=1000) as df_reader:
    with bz2.open(PATH_OUT, 'wb') as d_file:
        for chunk in df_reader:
            chunk_cleaned, chunk_length = process_chunk_complete(chunk, 0.5, matchers)
            chunk_json = chunk_cleaned.to_json(orient='columns')#, index=False)
            d_file.write((chunk_json+'\n').encode('utf-8'))

Processing chunk with 1000 rows
Processing chunk with 1000 rows


KeyboardInterrupt: 

In [49]:
df = pd.read_json(DATA_PATH + 'clean-quotes-2016.json.bz2', orient='columns', compression='bz2', nrows=100)
df

ValueError: nrows can only be passed if lines=True

In [44]:
chunk_cleaned.to_json(PATH_OUT, orient='columns')

In [39]:
chunk_json

