#Project 4: Natural Language Processing & New York Times Data
Eve Dean

##Notebook Outline:
The purpose of this notebook is to subset the original data into usable chunks for preprocessing and NLP. The data is also grouped by decade, with 1000 rows sampled per decade.

##Imports

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Mount Google Drive at /content/drive (Google Colab only) so the data files
# stored under MyDrive can be read by path in the cells below.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##Load data & remove NA

In [2]:
# Load the full NYT dataset from the Parquet file on the mounted Drive.
df = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/Project 4/Data/nyt_data.parquet')

In [3]:
# Count missing values in each column before any cleaning.
df.isna().sum()

year       0
title      0
excerpt    0
dtype: int64

In [4]:
# Drop every row that has a missing value in any column.
# The null counts printed above are all zero, so on this data the call removes
# nothing; it is kept as a safeguard. Note that `df` is rebound here.
df = df.dropna()

In [5]:
# Re-check the per-column null counts to confirm none remain after dropna().
df.isna().sum()

year       0
title      0
excerpt    0
dtype: int64

#Subset just articles from 1970

In [6]:
# Keep only the rows whose 'year' is exactly 1970, then preview the first five.
df_1970 = df.loc[df['year'].eq(1970)]
df_1970.head()

Unnamed: 0,year,title,excerpt
10366505,1970,Hudson County grand jury indicts Freeholder C ...,contract later rescinded; no money changed han...
10366506,1970,"Bklyn Fed Ct judge says Panamanian law, not Gr...",says generator breakdown was 'unforeseen event...
10366507,1970,"King Faisal arrives, Jakarta, for 3-day visit ...",
10366508,1970,Educ Bd says virtually all substitute position...,Asst Supt S Rosenberg says elimination is cond...
10366509,1970,US preliminary defense trials Valiant Twice Ou...,


In [7]:
# Save the 1970 subset as a CSV in the current working directory.
# index=False leaves the original row labels out of the file.
df_1970.to_csv('1970_df.csv', index=False)

#Subset 500 Random Rows

In [8]:
# Draw 500 random rows from the full dataset as a small working sample.
# random_state pins the draw so that re-running the notebook yields the same
# rows (and therefore the same mini_df.csv and per-title text files). The seed
# matches the one used for the per-decade sampling later in the notebook.
mini_df = df.sample(n=500, random_state=42)
print(mini_df.shape)
mini_df.head()

(500, 3)


Unnamed: 0,year,title,excerpt
12497018,1987,LEAD: *3*** COMPANY REPORTS ** *3*F...,
16902395,2015,There has been no fighting in Western Sahara f...,
1352909,1929,Will represent U S at 2d internatl diplomatic ...,
15010004,2004,Nov 16 picture caption with article about incr...,"it is Penneys, which is part of Associated Bri..."
14047149,1998,Carol J Christie letter objects to religious i...,


In [9]:
# Save the 500-row sample as a CSV in the current working directory.
# index=False leaves the original row labels out of the file.
mini_df.to_csv('mini_df.csv', index=False)

In [10]:
# Write each sampled title to its own text file, named by its position in
# mini_df: 0.txt, 1.txt, ..., 499.txt.
# mini_df keeps the original row labels of df (e.g. 12497018), so the titles
# are walked positionally. Looking up df.loc[i, 'title'] for i in 0..499 would
# export the first 500 rows of the full dataset instead of the sampled rows.
for i, title in enumerate(mini_df['title']):
  # Explicit UTF-8 so titles with non-ASCII characters (curly quotes, accents)
  # are written the same way regardless of the platform default encoding.
  with open(str(i) + '.txt', 'w', encoding='utf-8') as f:
    f.write(title)

#Subset 1000 rows from each decade

In [11]:
# Floor every year to the first year of its decade (e.g. 1987 -> 1980)
df['decade'] = df['year'].floordiv(10).mul(10)

# GroupBy handle with one group per distinct decade value
grouped = df.groupby(by='decade')

In [12]:
# Preview the frame to confirm the new 'decade' column was added.
df.head()

Unnamed: 0,year,title,excerpt,decade
0,1920,At last the Federal Reserve Board has issued r...,,1920
1,1920,WILL TEST DOOR SERVICE.,Service Board to Further Examine I.R.T. Safety...,1920
2,1920,Sanction for Chinese Contracts.,,1920
3,1920,"LEADS FRAZIER BY 4,496.",Langer's Margin Falls in North Dakota--Gronna ...,1920
4,1920,"CHICAGO, April 30.--With 300 suspicious charac...",Federal Agents and Police Round-- up Suspiciou...,1920


In [13]:
# On a GroupBy, head() returns the first 5 rows of *each* decade group,
# not just the first 5 rows overall.
grouped.head()

Unnamed: 0,year,title,excerpt,decade
0,1920,At last the Federal Reserve Board has issued r...,,1920
1,1920,WILL TEST DOOR SERVICE.,Service Board to Further Examine I.R.T. Safety...,1920
2,1920,Sanction for Chinese Contracts.,,1920
3,1920,"LEADS FRAZIER BY 4,496.",Langer's Margin Falls in North Dakota--Gronna ...,1920
4,1920,"CHICAGO, April 30.--With 300 suspicious charac...",Federal Agents and Police Round-- up Suspiciou...,1920
1498321,1930,Besides the speech of John D. Rockefeller to h...,Glimpse of Karolyi on Arrival Here Also Shown ...,1930
1498322,1930,NEW YORK WEEKLY BANK STATEMENTS,,1930
1498323,1930,Reviews work of Jacob H Schiff Centre CHILDREN...,300 Salute Portrait of the Philanthropist at S...,1930
1498324,1930,A fractionally higher call-money rate with a m...,Narrow Movement on Stock Exchange--Call Money ...,1930
1498325,1930,FIVE PLAYERS QUALIFY FOR CUE ROUND ROBIN,"Waldron, Snead, Smith, Blake and Marlow in Fin...",1930


In [14]:
# Draw 1000 random rows from every decade, 1920s through 2010s.
# Each decade's sample is collected in a list and concatenated once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all earlier rows
# on every pass, and seeding it with an empty DataFrame relies on concat
# behaviour that recent pandas versions flag as deprecated.
decade_samples = []

# Iterate over each decade
for decade in range(1920, 2020, 10):
    # Rows whose year falls in the half-open range [decade, decade + 10)
    in_decade = (df['year'] >= decade) & (df['year'] < decade + 10)
    # A fixed seed keeps each decade's 1000-row sample reproducible
    decade_samples.append(df.loc[in_decade].sample(n=1000, random_state=42))

# Stack the per-decade samples in chronological order; original row labels are kept
decade1000_df = pd.concat(decade_samples)

# Save the sampled DataFrame to a CSV file
decade1000_df.to_csv('decade1000_df.csv', index=False)

In [15]:
# Sanity check: 10 decades x 1000 rows each should give 10000 rows.
decade1000_df.shape

(10000, 4)

In [16]:
# Spot-check the first sampled rows (earliest decade).
decade1000_df.head()

Unnamed: 0,year,title,excerpt,decade
658643,1926,"A new aspirant for Tom Mix's audiences, named ...",,1920
311947,1922,"LONDON, Jan. 30.--Ought girls to be allowed to...","Discharged for Lighting Cigarette in Office, G...",1920
431154,1923,J Oudegeest compares development in U S and Eu...,"sp art, with Oudegeest's por GAMPERS FAILS AS ...",1920
872465,1927,Statement on play by Chinese student to be pro...,""" Peking Politics"" Called First Effort by Nati...",1920
1212106,1928,CITY COLLEGE DRILLS AGAINST R.P.I. PLAYS,Varsity Is Nearly Scored on by Jayvees--Goldha...,1920


In [17]:
# Spot-check the last sampled rows (latest decade).
decade1000_df.tail()

Unnamed: 0,year,title,excerpt,decade
16843463,2014,“The Equalizer” stars Denzel Washington as a s...,,2010
17300839,2019,Six children were among the victims of a massa...,,2010
16857658,2015,The Interior Ministry said that more than 60 p...,,2010
16619756,2013,"With significant sums about to change hands, l...",,2010
17148359,2017,"A suspect in the attacks, Ahmed Abu Khattala, ...",,2010


#Combine title & excerpt into a single column

In [18]:
# Reload the per-decade sample from CSV.
# NOTE(review): the sampling cell writes 'decade1000_df.csv' to the current
# working directory, while this reads it from the Drive 'Data' folder. On a
# fresh runtime the file must be copied there first -- confirm, or align paths.
df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Project 4/Data/decade1000_df.csv')

In [19]:
# Empty CSV fields come back from read_csv as NaN; swap them for empty strings
# so that title and excerpt can be concatenated as text.
df2 = df2.fillna('')

In [20]:
# Join title and excerpt, separated by a single space, into one text column.
df2 = df2.assign(Combined_Text=df2['title'] + ' ' + df2['excerpt'])

df2.head()

Unnamed: 0,year,title,excerpt,decade,Combined_Text
0,1926,"A new aspirant for Tom Mix's audiences, named ...",,1920,"A new aspirant for Tom Mix's audiences, named ..."
1,1922,"LONDON, Jan. 30.--Ought girls to be allowed to...","Discharged for Lighting Cigarette in Office, G...",1920,"LONDON, Jan. 30.--Ought girls to be allowed to..."
2,1923,J Oudegeest compares development in U S and Eu...,"sp art, with Oudegeest's por GAMPERS FAILS AS ...",1920,J Oudegeest compares development in U S and Eu...
3,1927,Statement on play by Chinese student to be pro...,""" Peking Politics"" Called First Effort by Nati...",1920,Statement on play by Chinese student to be pro...
4,1928,CITY COLLEGE DRILLS AGAINST R.P.I. PLAYS,Varsity Is Nearly Scored on by Jayvees--Goldha...,1920,CITY COLLEGE DRILLS AGAINST R.P.I. PLAYS Varsi...


In [21]:
# The separate text columns are now redundant with Combined_Text; remove them.
df2 = df2.drop(['title', 'excerpt'], axis='columns')
df2.head()

Unnamed: 0,year,decade,Combined_Text
0,1926,1920,"A new aspirant for Tom Mix's audiences, named ..."
1,1922,1920,"LONDON, Jan. 30.--Ought girls to be allowed to..."
2,1923,1920,J Oudegeest compares development in U S and Eu...
3,1927,1920,Statement on play by Chinese student to be pro...
4,1928,1920,CITY COLLEGE DRILLS AGAINST R.P.I. PLAYS Varsi...


In [22]:
# Save the combined-text frame (year, decade, Combined_Text) as a CSV in the
# current working directory; index=False leaves the row labels out of the file.
df2.to_csv('decade1000_combinedtxt.csv', index=False)

In [23]:
# Read the combined-text CSV back in as a round-trip check.
# NOTE(review): the absolute path assumes the working directory of the previous
# cell was /content (the usual Colab default) -- confirm.
df3 = pd.read_csv('/content/decade1000_combinedtxt.csv')