In [35]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [36]:
# Read every enrichment dataset from its CSV file.
wiki_words = pd.read_csv('nnyt-data-v3-wiki.csv')
urban_dictionary_words = pd.read_csv('nnyt-data-v4-urban-dictionary.csv')
dictionary_words = pd.read_csv('nnyt-data-v5-dictionary.csv')
other_language_words = pd.read_csv('nnyt-data-google-trans-final.csv')
article_search_words = pd.read_csv('nnyt-data-v2-article-search.csv')

# Remove the old index column ("Unnamed: 0") from the frames listed below.
# NOTE: dictionary_words is not part of this tuple, so it keeps its
# "Unnamed: 0" column at this point.
for frame in (wiki_words, urban_dictionary_words, other_language_words, article_search_words):
    frame.drop(columns=["Unnamed: 0"], inplace=True)

In [37]:
# Every merge joins on the same set of tweet-level columns.
merge_keys = ["id", "full_text", "word_len", "favorite_count", "retweet_count", "date", "hour", "time", "created_at"]

# Fold the datasets together one at a time (pandas' default inner join),
# keeping each intermediate frame available under its own name.
wiki_ud = pd.merge(wiki_words, urban_dictionary_words, on=merge_keys)        # wiki + urban dictionary
wiki_ud_dict = pd.merge(wiki_ud, dictionary_words, on=merge_keys)            # + dictionary
wiki_ud_dict_gt = pd.merge(wiki_ud_dict, other_language_words, on=merge_keys)  # + google trans

# Columns that are not needed after the merge: leftover index columns and
# the free-text description/definition/language fields.
irrelevant_columns = ["Unnamed: 0", "Unnamed: 0.1_x", "Unnamed: 0.1_y", "wiki_description", "urban_dictionary_definition", "guessed_language"]
# Boolean membership flags that are stored as binary ints.
flag_columns = ["is_in_wiki", 'is_in_urban_dictionary', 'is_in_dictionary', 'is_other_language']

# Merge in the article search results, then drop and cast in one chain.
final_data = (
    pd.merge(wiki_ud_dict_gt, article_search_words, on=merge_keys)
    .drop(columns=irrelevant_columns)
    .astype(dict.fromkeys(flag_columns, 'int'))
)
# Display the result to check the data looks correct.
final_data

Unnamed: 0,id,full_text,word_len,favorite_count,retweet_count,date,hour,time,created_at,is_in_wiki,is_in_urban_dictionary,is_in_dictionary,is_other_language,article_search_json
0,1.318931e+18,punkify,7,203,44,2020-10-21,15,15:01:57,2020-10-21 15:01:57,0,0,0,0,{'abstract': 'His outlandish costumes embellis...
1,1.318885e+18,amiante,7,11,0,2020-10-21,12,12:01:52,2020-10-21 12:01:52,0,0,0,1,{'abstract': 'The Quebec town is home to one o...
2,1.318613e+18,pentalogue,10,10,0,2020-10-20,18,18:01:58,2020-10-20 18:01:58,0,0,0,0,{'abstract': '“The Silence” invites readers to...
3,1.318613e+18,apocalypst,10,60,5,2020-10-20,18,18:01:52,2020-10-20 18:01:52,0,0,0,0,{'abstract': '“The Silence” invites readers to...
4,1.318485e+18,casedemic,9,13,2,2020-10-20,9,09:33:31,2020-10-20 09:33:31,0,0,0,0,{'abstract': 'Against both the coronavirus and...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5645,,money:around,12,0,0,2017-10-15,11,11:01:00,2017-10-15 11:01:00+00:00,0,0,0,0,{'abstract': 'Robert Trice is pursuing his qui...
5646,,briar,5,0,0,2017-10-15,11,11:00:58,2017-10-15 11:00:58+00:00,0,1,1,0,{'abstract': 'In Emma Donoghue’s first childre...
5647,,additively,10,0,0,2017-10-15,10,10:01:09,2017-10-15 10:01:09+00:00,0,0,0,0,{'abstract': 'Natural coloring agents made fro...
5648,,subtractively,13,0,0,2017-10-15,10,10:01:09,2017-10-15 10:01:09+00:00,0,0,0,0,{'abstract': 'As part of the nearly $800 milli...


In [38]:
# Report how many rows are flagged for each lookup source.
flag_reports = [
    ("# in Wikipedia: ", "is_in_wiki"),
    ("# in Urban Dictionary: ", "is_in_urban_dictionary"),
    ("# in Dictionary.com: ", "is_in_dictionary"),
    ("# in Other Language (Google Trans detected): ", "is_other_language"),
]
for report_label, flag_column in flag_reports:
    # Summing the boolean mask counts the rows where the flag equals 1.
    print(report_label, int((final_data[flag_column] == 1).sum()))

# Rows whose article search value is anything other than the text 'False'.
print("# found in NYT Article Search: ", int((final_data["article_search_json"] != 'False').sum()))

# in Wikipedia:  502
# in Urban Dictionary:  581
# in Dictionary.com:  146
# in Other Language (Google Trans detected):  866
# found in NYT Article Search:  5311


In [39]:
# Short label written into "combined_sources" for each membership flag column.
# The dict order fixes the label order inside multi-source values
# (e.g. "wiki urbdict dict").
SOURCE_LABELS = {
    "is_in_wiki": "wiki",
    "is_in_urban_dictionary": "urbdict",
    "is_in_dictionary": "dict",
    "is_other_language": "othlang",
}

# Build one space-separated label string per row in a single pass over the
# flag values; a row with no flag set gets 'none'.
source_flags = final_data[list(SOURCE_LABELS)].to_numpy()
source_labels_per_row = [
    " ".join(label for label, flag in zip(SOURCE_LABELS.values(), row_flags) if flag == 1) or "none"
    for row_flags in source_flags
]

# DataFrame.insert raises ValueError when the column already exists (e.g. when
# this cell is re-run). Handle that case explicitly by overwriting the column
# rather than hiding every possible error behind a bare `except`.
if "combined_sources" in final_data.columns:
    final_data["combined_sources"] = source_labels_per_row
else:
    final_data.insert(9, "combined_sources", source_labels_per_row)

final_data["combined_sources"].value_counts()

none                    3875
othlang                  702
urbdict                  419
wiki                     301
wiki othlang              72
urbdict othlang           65
dict                      61
wiki urbdict              59
wiki dict                 42
urbdict dict              16
wiki urbdict dict         11
wiki urbdict othlang      11
dict othlang              10
wiki dict othlang          6
Name: combined_sources, dtype: int64

In [40]:
# List the distinct source combinations present in the data.
final_data.loc[:, "combined_sources"].unique()

array(['none', 'othlang', 'wiki', 'dict', 'urbdict', 'wiki urbdict dict',
       'wiki urbdict', 'urbdict othlang', 'urbdict dict', 'wiki othlang',
       'wiki dict', 'wiki urbdict othlang', 'dict othlang',
       'wiki dict othlang'], dtype=object)

In [41]:
##mining info from article search
final_data.insert(15, "byline", ['' for i in range(len(final_data))])
final_data.insert(16, "web_url", ['' for i in range(len(final_data))])
final_data.insert(17, "news_desk", ['' for i in range(len(final_data))])
final_data.insert(18, "section_name", ['' for i in range(len(final_data))])
final_data.insert(19, "type_of_material", ['' for i in range(len(final_data))])
final_data.insert(19, "pub_date", ['' for i in range(len(final_data))])

for index, row in final_data.iterrows():
    #if there is article json, pull info from it
    if row['article_search_json'] != 'False':
        test = row['article_search_json'].split(", ")
        for i in test:
            i = i.replace("'", "")

            #pull out original byline...
            if "byline" in i.split(" ")[0]:
                rough_byline = i.split("{original: ")[1].replace("By ", "")
                final_data.at[index, 'byline'] = str(rough_byline)
            #get url
            if "web_url" in i:
                url = (i.split(" ")[1]) 
                final_data.at[index, 'web_url'] = url
            #get news desk
            if "news_desk" in i:
                news_desk = i.split(": ")[1]
                final_data.at[index, 'news_desk'] = news_desk
            #get section
            if "section_name" in i:
                section = i.split(": ")[1]
                final_data.at[index, 'section_name'] = section
            #get section
            if "type_of_material" in i:
                type_material = i.split(": ")[1]
                final_data.at[index, 'type_of_material'] = type_material
            #get date
            if "pub_date" in i:
                pub_date = i.split(": ")[1]
                final_data.at[index, 'pub_date'] = pub_date

In [42]:
# Remove the raw article search column from the frame in place,
# then preview the first five rows.
del final_data["article_search_json"]
final_data.head(5)

Unnamed: 0,id,full_text,word_len,favorite_count,retweet_count,date,hour,time,created_at,combined_sources,is_in_wiki,is_in_urban_dictionary,is_in_dictionary,is_other_language,byline,web_url,news_desk,section_name,pub_date,type_of_material
0,1.318931e+18,punkify,7,203,44,2020-10-21,15,15:01:57,2020-10-21 15:01:57,none,0,0,0,0,Penelope Green,https://www.nytimes.com/2020/10/21/arts/robert...,Obits,Arts,2020-10-21T14:33:16+0000,Obituary (Obit)
1,1.318885e+18,amiante,7,11,0,2020-10-21,12,12:01:52,2020-10-21 12:01:52,othlang,0,0,0,1,Marie Fazio,https://www.nytimes.com/2020/10/21/world/ameri...,Express,Americas,2020-10-21T11:38:52+0000,News
2,1.318613e+18,pentalogue,10,10,0,2020-10-20,18,18:01:58,2020-10-20 18:01:58,none,0,0,0,0,Joshua Cohen,https://www.nytimes.com/2020/10/20/books/revie...,Books,Book Review,2020-10-20T17:50:41+0000,Review
3,1.318613e+18,apocalypst,10,60,5,2020-10-20,18,18:01:52,2020-10-20 18:01:52,none,0,0,0,0,Joshua Cohen,https://www.nytimes.com/2020/10/20/books/revie...,Books,Book Review,2020-10-20T17:50:41+0000,Review
4,1.318485e+18,casedemic,9,13,2,2020-10-20,9,09:33:31,2020-10-20 09:33:31,none,0,0,0,0,Ross Douthat,https://www.nytimes.com/2020/10/20/opinion/tru...,OpEd,Opinion,2020-10-20T09:00:10+0000,Op-Ed


In [43]:
# Write the final dataset to disk; the row index is written as well
# (index=True is the pandas default, spelled out here).
final_data.to_csv('final-nnyt-data.csv', index=True)