In [None]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline
# General libraries.
import pandas as pd 
import re
import numpy as np
import matplotlib.pyplot as plt
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
from sklearn.cluster import KMeans

import nltk
from tqdm import tqdm

In [None]:
# Load the training split and keep only its first 1000 rows as a small working sample.
df = pd.read_csv(
    '/Users/daphneyang/Desktop/5YMIDS_SP21/w266/266_final/nyt_data_collection/dataset/train_dataset.csv'
).iloc[:1000]
df.head()

In [None]:
## creating functions
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import string


def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Args:
        text: String to process; it is tokenized with NLTK's
            ``WhitespaceTokenizer``.

    Returns:
        The lemmatized tokens joined by single spaces, with no leading or
        trailing space. An input without tokens yields ``""``.
    """
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()
    # Joining with " " avoids the dangling separator that appending " " after
    # every token used to leave at the end of the result.
    return " ".join(lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text))


def clean_text(text):
    """Normalize a paragraph before it is vectorized.

    Steps, in order:
    1. Remove digits -- may want to revisit this if numbers are important
       (thinking like covid cases and such).
    2. Remove punctuation (every character that is neither a word character
       nor whitespace).
    3. Lowercase every word and drop common English stopwords.
    4. Lemmatize the remaining words with ``lemmatize_text``.

    Args:
        text: The raw paragraph. ``None`` and float NaN are treated as
            "no paragraph"; any other value is converted with ``str``.

    Returns:
        The cleaned text, or a single space ``" "`` when there is no input or
        nothing is left after cleaning.
    """
    # A missing paragraph must not be stringified into the word "None"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return " "
    text = str(text)
    if len(text) < 1:
        # no available first paragraph
        return " "

    no_numbers = re.sub(r'[0-9]', "", text)
    no_punct = re.sub(r'[^\w\s]', '', no_numbers)

    # Load the stopword list once per call instead of once per word.
    stop_words = set(stopwords.words("english"))
    # Lowercase BEFORE the stopword test: the stopword entries are lowercase,
    # so capitalized words such as "There" would otherwise slip through.
    kept_words = [word for word in no_punct.lower().split()
                  if word not in stop_words]
    lemm_text = lemmatize_text(' '.join(kept_words))
    if len(lemm_text) < 1:
        # empty after regex / stopword removal
        return " "
    return lemm_text

In [None]:
# Walk through the clean_text steps on one sentence to inspect the result.
text = "There are 4572 students in CS61B and CS1A."
no_numbers = re.sub(r'[0-9]', "", text)
no_punct = re.sub(r'[^\w\s]', '', no_numbers)
# Load the stopword list once, and compare the LOWERCASED word against it so
# that capitalized stopwords such as "There" are removed as well.
stop_words = set(stopwords.words("english"))
lower_text = [word.lower() for word in no_punct.split()
              if word.lower() not in stop_words]
lower_text = ' '.join(lower_text)
lemm_text = lemmatize_text(lower_text).strip()
lemm_text

In [None]:
## cleaning function
# Run clean_text over every first paragraph and store the result in a new column.
df['cleaned_first_paragraph'] = df['first_paragraph'].map(clean_text)
df.head()

In [None]:
def compare_clean(idx):
    """Print row ``idx`` of the global ``df``: its raw first paragraph, then the cleaned one.

    Args:
        idx: Key used to index the ``first_paragraph`` and
            ``cleaned_first_paragraph`` columns of ``df``.
    """
    print(f"Row Number: {idx}\n")
    original_paragraph = df["first_paragraph"][idx]
    print(f"Original text: \n {original_paragraph} \n")
    cleaned_paragraph = df["cleaned_first_paragraph"][idx]
    print(f"Cleaned text: \n {cleaned_paragraph}")

In [None]:
## Document Clustering using TFIDF
# Fit a TF-IDF vectorizer on the cleaned paragraphs; X holds the transformed documents.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(
    df['cleaned_first_paragraph'].tolist()
)

In [None]:
## Output of first 10 feature names from small training sample
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that lack it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Convert to plain str so the printed list looks the same on either code path.
print([str(name) for name in feature_names[:10]])

In [None]:
## Clustering
# Fix the seed so the random centroid initialisation (and therefore the
# resulting clusters) is the same on every run of the notebook.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_fit = kmeans.fit(X)
kmeans_fit

In [None]:
# Elbow-method sweep: fit one KMeans model per candidate k on X and record
# its inertia_ value.
SSD = []
K = range(1,50)
for k in tqdm(K):
    # Same seed for every k so the curve is reproducible across runs.
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(X)
    SSD.append(km.inertia_)

In [None]:
# NOTE(review): this binds a second alias `pyplot` that this cell never uses;
# every call below goes through `plt`.
import matplotlib.pyplot as pyplot
# Elbow plot: the values collected in SSD against the candidate cluster counts in K.
plt.plot(K, SSD, "bx-")
plt.xlabel('k')
plt.ylabel("Sum of Squared Distances")
plt.title('Elbow Method For Optimal Clusters')

In [None]:
# Keep the keyword strings that contain any of the search terms
# (case-sensitive substring match). `keywords` is the list built in the
# "Subsetting Data" part of this notebook.
matching = list(filter(
    lambda kw: any(term in kw for term in ['Coronavirus']),
    keywords,
))

In [None]:
# Fit the dedicated vectorizer created here. Calling fit_transform on the
# shared `vectorizer` instead would overwrite the vocabulary it learned from
# `df` and leave `keyword_vectorizer` unfitted.
# NOTE(review): `covid_df` is created further down in this notebook and must
# already carry a `cleaned_first_paragraph` column -- confirm the run order.
keyword_vectorizer = TfidfVectorizer()
X_keywords = keyword_vectorizer.fit_transform(covid_df.cleaned_first_paragraph.to_list())

In [None]:
# Elbow-method sweep for the coronavirus subset: one KMeans fit per k,
# recording each model's inertia_ value.
SSD_keywords = []
K_keywords = range(1,100)
for k in tqdm(K_keywords):
    # Same seed for every k so the curve is reproducible across runs.
    km = KMeans(n_clusters=k, random_state=42)
    # Fit on X_keywords (the subset's matrix). Fitting on X merely repeated
    # the earlier sweep over the training sample.
    km = km.fit(X_keywords)
    SSD_keywords.append(km.inertia_)

In [None]:
# Elbow plot for the second sweep: the values in SSD_keywords against the
# candidate cluster counts in K_keywords.
plt.plot(K_keywords, SSD_keywords, "bx-")
plt.xlabel('k')
plt.ylabel("Sum of Squared Distances")
plt.title('Elbow Method For Optimal Clusters')

# Subsetting Data

In [None]:
# Rebind `df` to the complete NYT dataset (no row limit is applied here).
df = pd.read_csv(
    '/Users/daphneyang/Desktop/5YMIDS_SP21/w266/266_final/nyt_data_collection/dataset/full_nyt_dataset.csv'
)

In [None]:
def data_subset(df, keyword, column = "first_paragraph"):
    """Return the rows of ``df`` whose ``column`` text contains ``keyword``.

    The match is case-insensitive in both directions, so ``"Coronavirus"`` and
    ``"coronavirus"`` select the same rows. ``keyword`` is interpreted as a
    regular expression (the pandas ``str.contains`` default).

    Args:
        df: DataFrame to filter; it is not modified.
        keyword: Text (or regex pattern) to search for.
        column: Name of the text column to search. Rows where this column is
            missing are dropped before matching.

    Returns:
        A DataFrame holding only the matching rows, with their original index
        labels.
    """
    rows = df.dropna(subset=[column])
    # case=False replaces lowercasing only the column, which silently matched
    # nothing whenever the keyword itself contained an uppercase letter.
    # na=False keeps non-string cells from putting NaN into the boolean mask.
    mask = rows[column].str.contains(keyword, case=False, na=False)
    return rows[mask]

In [None]:
# Flatten the per-article keyword strings into one list of keywords.
# Iterating over the values (not over range(len(...)) labels) keeps this
# working after rows have been dropped, and dropna() skips articles without
# keywords, on which .replace would fail.
# NOTE(review): a keyword that itself contains a comma is still split into
# pieces here -- confirm the column format before relying on exact names.
keywords = []
for raw_keywords in df['keywords'].dropna():
    cleaned_row = str(raw_keywords).replace('[','')
    cleaned_row = cleaned_row.replace(']','')
    cleaned_row = cleaned_row.replace("'",'')
    # Strip the space left after each comma so "x" and " x" are one keyword,
    # and drop the empty token produced by an empty list.
    keywords.extend(
        token.strip() for token in cleaned_row.split(",") if token.strip()
    )

In [None]:
# Keep only the articles that have a first paragraph. The index is not reset,
# so the surviving rows keep their original labels.
df = df[df['first_paragraph'].notna()]

In [None]:
# Articles whose first paragraph mentions "coronavirus". Uses the data_subset
# helper instead of repeating the filter inline; the helper also drops rows
# with a missing paragraph, so this cell does not depend on an earlier dropna.
covid_df = data_subset(df, 'coronavirus')
covid_df

In [None]:
# Write the current covid_df to CSV without the index column.
covid_df.to_csv(
    '../nyt_data_collection/fp_covid_articles.csv',
    index=False,
)

In [None]:
# Display the rows whose first paragraph contains "coronavirus".
data_subset(df, keyword="coronavirus")

In [None]:
# Rebind covid_df to the articles whose *keywords* mention "coronavirus".
# Uses the data_subset helper instead of repeating the filter inline; the
# helper drops rows with missing keywords, which would otherwise put NaN
# into the boolean mask.
covid_df = data_subset(df, 'coronavirus', column='keywords')
covid_df

In [None]:
# Write the current covid_df to CSV without the index column.
covid_df.to_csv(
    '../nyt_data_collection/covid_articles.csv',
    index=False,
)

In [None]:
# De-duplicate the keyword list. Sorting makes the order deterministic; the
# iteration order of a set of strings can differ between kernel sessions.
keywords = sorted(set(keywords))

In [None]:
# Articles whose keywords mention "oil", via the shared data_subset helper
# (which also drops rows with missing keywords).
# NOTE(review): this is a substring match, so it also selects keywords that
# merely contain the letters "oil" -- confirm that is intended.
oil_df = data_subset(df, 'oil', column='keywords')
oil_df

In [None]:
# Write oil_df to CSV without the index column.
oil_df.to_csv(
    '../nyt_data_collection/oil_articles.csv',
    index=False,
)

In [None]:
# Articles whose keywords mention "masters golf", via the shared data_subset
# helper (which also drops rows with missing keywords).
golf_df = data_subset(df, 'masters golf', column='keywords')
golf_df

In [None]:
# Write golf_df to CSV without the index column.
golf_df.to_csv(
    '../nyt_data_collection/golf_articles.csv',
    index=False,
)