In [68]:
#Libraries
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

Section 1: Clean and Verify Data

In [122]:
#Read in Data
triviadata1 = pd.read_csv('C:\\Users\\shyar\\OneDrive\\Bootcamp\\Project3\\JEOPARDY_CSV.csv')

In [123]:
#check data structure
triviadata1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       213296 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216927 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [124]:
value_counts = triviadata1["Category"].value_counts()
toremove = value_counts[value_counts == 1].index
reduced_trivia = triviadata1[~triviadata1.Category.isin(toremove)]
reduced_trivia

KeyError: 'Category'

In [127]:
#View Sample of Data
triviadata1.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,12/31/2004,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,12/31/2004,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,12/31/2004,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,12/31/2004,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,12/31/2004,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [129]:
#preview dates
triviadata1[' Air Date'].value_counts()

 Air Date
5/19/1997     62
11/13/2007    62
11/1/2011     61
2/5/1993      61
2/15/2000     61
              ..
6/26/2000     30
1/27/2012     30
9/3/1996      30
10/21/1997    16
10/18/1989    12
Name: count, Length: 3640, dtype: int64

In [130]:
# Remove Leading Spaces from Column Names
triviadata1 =  triviadata1.rename(columns={"Show Number": "Show_Number",
                            " Air Date": "Air_Date",
                            " Round": "Round",
                            " Category": "Category",
                            " Value": "Value",
                            " Question": "Question",
                            " Answer": "Answer"})

In [131]:
#Convert Air Date to Date Format
triviadata1["Air_Date"] = pd.to_datetime(triviadata1.Air_Date)
triviadata1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Show_Number  216930 non-null  int64         
 1   Air_Date     216930 non-null  datetime64[ns]
 2   Round        216930 non-null  object        
 3   Category     216930 non-null  object        
 4   Value        213296 non-null  object        
 5   Question     216930 non-null  object        
 6   Answer       216927 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 11.6+ MB


In [134]:
value_counts = triviadata1["Category"].value_counts()
toremove = value_counts[value_counts <= 50].index
reduced_trivia = triviadata1[~triviadata1.Category.isin(toremove)]
reduced_trivia

Unnamed: 0,Show_Number,Air_Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way
11,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$400,Cows regurgitate this from the first stomach t...,the cud
12,4680,2004-12-31,Jeopardy!,HISTORY,$600,In 1000 Rajaraja I of the Cholas battled to ta...,Ceylon (or Sri Lanka)
...,...,...,...,...,...,...,...
216917,4999,2006-05-11,Double Jeopardy!,WORLD CAPITALS,"$3,400",Guyanese capital named for a Hanoverian monarch,Georgetown
216922,4999,2006-05-11,Double Jeopardy!,QUOTATIONS,"$1,600","A motto of hers was ""in politics, if you want ...",(Margaret) Thatcher
216923,4999,2006-05-11,Double Jeopardy!,WORLD CAPITALS,"$2,000",It's on the Suriname River,Paramaribo
216928,4999,2006-05-11,Double Jeopardy!,QUOTATIONS,"$2,000","From Ft. Sill, Okla. he made the plea, Arizona...",Geronimo


In [135]:
reduced_trivia["Category"].value_counts()

Category
BEFORE & AFTER         547
SCIENCE                519
LITERATURE             496
AMERICAN HISTORY       418
POTPOURRI              401
                      ... 
AWARDS & HONORS         51
HIGHWAYS & BYWAYS       51
ANCIENT GREECE          51
HISTORIC PEOPLE         51
FROM PAGE TO SCREEN     51
Name: count, Length: 332, dtype: int64

In [137]:
# Preview Categories
#pd.set_option("display.max_rows", None)
reduced_trivia["Category"].value_counts()

Category
BEFORE & AFTER         547
SCIENCE                519
LITERATURE             496
AMERICAN HISTORY       418
POTPOURRI              401
                      ... 
AWARDS & HONORS         51
HIGHWAYS & BYWAYS       51
ANCIENT GREECE          51
HISTORIC PEOPLE         51
FROM PAGE TO SCREEN     51
Name: count, Length: 332, dtype: int64

Section 2: Categorize Trivia

In [None]:
#Remove columns that will not add value
reduced_trivia = reduced_trivia.drop('Show_Number', axis=1)

In [138]:
#Concatinate 
reduced_trivia["Concat"]  = reduced_trivia["Category"] + " "+ reduced_trivia["Question"] + " " + reduced_trivia["Answer"]
reduced_trivia =  reduced_trivia.dropna()
reduced_trivia.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_trivia["Concat"]  = reduced_trivia["Category"] + " "+ reduced_trivia["Question"] + " " + reduced_trivia["Answer"]


Unnamed: 0,Show_Number,Air_Date,Round,Category,Value,Question,Answer,Concat
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"HISTORY For the last 8 years of his life, Gali..."
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,"3-LETTER WORDS In the title of an Aesop fable,..."
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,HISTORY Built in 312 B.C. to link Rome & the S...
11,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$400,Cows regurgitate this from the first stomach t...,the cud,3-LETTER WORDS Cows regurgitate this from the ...
12,4680,2004-12-31,Jeopardy!,HISTORY,$600,In 1000 Rajaraja I of the Cholas battled to ta...,Ceylon (or Sri Lanka),HISTORY In 1000 Rajaraja I of the Cholas battl...
17,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$600,"A small demon, or a mischievous child (who mig...",imp,"3-LETTER WORDS A small demon, or a mischievous..."
18,4680,2004-12-31,Jeopardy!,HISTORY,$800,Karl led the first of these Marxist organizati...,the International,HISTORY Karl led the first of these Marxist or...
23,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$800,"In geologic time one of these, shorter than an...",era,"3-LETTER WORDS In geologic time one of these, ..."
24,4680,2004-12-31,Jeopardy!,HISTORY,"$1,000",This Asian political party was founded in 1885...,the Congress Party,HISTORY This Asian political party was founded...
28,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,"$1,000","A single layer of paper, or to perform one's c...",ply,"3-LETTER WORDS A single layer of paper, or to ..."


In [139]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english')
tfidf

In [140]:
# Transform each row from trivia data to a DTM.
dtm = tfidf.fit_transform(reduced_trivia["Concat"])
# Get the shape of the DTM.
print(dtm.shape)

(41130, 11930)


In [None]:
#check for NA's
reduced_trivia["Concat"].isna().sum()


0

In [142]:
# Initialize the NMF and set the number of topics. 
nmf_model = NMF(n_components=5,random_state=42)
# Fit the model with our DTM data. 
nmf_model.fit(dtm)

In [143]:
# Check the length of the vocabulary 
len(tfidf.get_feature_names_out())

11930

In [144]:
# Print the top words for each topic
for index,topic in enumerate(nmf_model.components_):
    print(f'The top words for topic #{index+1}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-33:]])
    print('\n')



The top words for topic #1
['sarah', '01', 'jimmy', '09', '12', 'reports', '03', '04', '2009', '02', '10', '2010', '11', '2011', '07', '05', '2008', '2005', '06', '2007', '2006', 'seen', 'crew', 'clue', 'jpg', '_blank', 'target', 'media', 'href', 'http', 'com', 'www', 'archive']


The top words for topic #2
['old', 'game', 'french', 'describes', 'dictionary', 'phrase', 'precedes', 'small', 'origins', 'synonym', 'adjective', 'comes', 'person', 'type', 'like', 'greek', 'odd', 'used', 'mean', 'meaning', 'phrases', 'foreign', 'means', 'latin', 'term', '13', '12', 'perfect', 'add', '10', 'word', 'words', 'letter']


The top words for topic #3
['founded', 'bodies', 'national', 'mexico', 'travel', 'area', 'home', 'water', 'university', 'lies', 'york', 'countries', 'miles', 'north', 'sea', 'south', 'named', 'history', 'states', 'islands', 'lake', 'river', 'island', 'new', 'largest', 'cities', 'country', 'geography', 'capitals', 'capital', 'city', 'state', 'world']


The top words for topic #4


In [None]:
 # Use the add_topic_labels function to add the topic and topic label to each news summary. 
# Dictionary of  topics and topic label.
topic_labels = {
    1: 'Tech and Media',
    2: 'Literature',
    3: 'Geography',
    4: 'Culture',
    5: 'History'

}

In [None]:
#KNN clustering
# Define features set
X = app_data.copy()
X.drop("Result", axis=1, inplace=True);g


In [67]:
# Another type of Clustering
# Perform hierarchical clustering
dist_matrix = 1 - np.corrcoef(triviadata1['Category'].value_counts().values.reshape(1, -1).T)
Z = linkage(dist_matrix, method='ward')
n_clusters = 10

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


ValueError: The condensed distance matrix must contain only finite values.

In [None]:
# Perform hierarchical clustering
clusters = fcluster(Z, n_clusters, criterion='maxclust')

# Create a mapping of original categories to new clusters
category_mapping = {i: c for i, c in enumerate(clusters)}

# Add cluster information to the original dataframe
triviadata1['cluster'] = triviadata1['category'].map(category_mapping)

# Count questions per category
question_counts = triviadata1['category'].value_counts()

# Select top 10 categories based on number of questions
top_categories = question_counts.nlargest(10).index.tolist()

# Create final dataframe with selected categories
final_df = triviadata1[triviadata1['Category'].isin(top_categories)]

# Create a new column 'selected_category' with the top category names
final_df['selected_category'] = final_df['category']

# Add the mapping to the dataframe
final_df['category_mapping'] = final_df['cluster'].map(lambda x: list(category_mapping.keys())[list(category_mapping.values()).index(x)])

print(final_df)