In [6]:
import numpy as np
import csv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.ensemble import GradientBoostingRegressor  
from sklearn.ensemble import GradientBoostingClassifier  
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, mean_squared_error

# Fetch the NLTK data used below: the English stop-word list and the Punkt
# sentence-tokenizer models that word_tokenize relies on. Packages that are
# already present are reported as up to date rather than downloaded again.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab' instead
# of the pickled 'punkt' package; without it word_tokenize raises LookupError.
# 'punkt' above is kept so older NLTK installations keep working.
nltk.download('punkt_tab')
import string
import re


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# English stop-word list, used by the TF-IDF vectorizer and the word cloud.
# The corpus reader is reached through nltk.corpus rather than through the
# imported name `stopwords`, because this assignment rebinds `stopwords` to a
# plain list: the original right-hand side (`stopwords.words(...)`) therefore
# raised AttributeError whenever the cell was executed a second time.
stopwords = nltk.corpus.stopwords.words("english")

In [8]:
# Load the passages; the columns used later in the notebook are "text" and "birth_yr"
raw_data = pd.read_csv("data/data.csv")
# Preview the first five rows
raw_data.head(n=5)

Unnamed: 0,book_id,text,birth_yr
0,84,"“And now, with the world before me, whither sh...",1797
1,84,"“You have been ill, very ill, and even the con...",1797
2,84,“I intended to reason. This passion is detrime...,1797
3,84,"“How is this? I must not be trifled with, and ...",1797
4,84,"“A few days after, the Turk entered his daught...",1797


In [9]:
def preprocess(text):
    """Normalise a passage for vectorisation.

    Every maximal run of characters other than ASCII letters and digits
    (punctuation, whitespace, curly quotes, accented letters, ...) is
    collapsed into a single space, and the result is lower-cased. Spaces
    produced at the start or end of the string are not stripped.

    Parameters
    ----------
    text : str
        Raw passage text.

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    # Substitute first, lower-case second: the order matters for non-ASCII input.
    collapsed = re.compile(r'[^A-Za-z0-9]+').sub(" ", text)
    return collapsed.lower()
# Clean every passage with preprocess(); the "text" column is overwritten in place,
# so from here on raw_data holds the normalised text, not the original wording.
raw_data["text"] = raw_data.get("text").map(preprocess)
raw_data

Unnamed: 0,book_id,text,birth_yr
0,84,and now with the world before me whither shou...,1797
1,84,you have been ill very ill and even the const...,1797
2,84,i intended to reason this passion is detrimen...,1797
3,84,how is this i must not be trifled with and i ...,1797
4,84,a few days after the turk entered his daughte...,1797
...,...,...,...
8713,60693,alsint had evolved cells that were far more vi...,1915
8714,60693,one last thing he said remember that plants e...,1915
8715,60946,no instantaneous transmission of matter wasn t...,1933
8716,60946,gordus waved a fat hand in front of him laughi...,1933


In [13]:
# TF-IDF vectorizer over word tokens:
#   - NLTK's word_tokenize splits the text, and the English stop-word list is dropped
#   - the vocabulary is capped at 50,000 features
#   - sublinear_tf=True switches on scikit-learn's sublinear term-frequency scaling
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    stop_words=stopwords,
    max_features=50000,
    sublinear_tf=True,
)
tfidf

In [14]:
# Fit the vectorizer on every passage and materialise the result as a dense array.
# NOTE(review): .toarray() allocates the full rows x (up to 50,000) matrix in memory.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split below,
# so its vocabulary and IDF weights also see the held-out passages.
tfidf_array = tfidf.fit_transform(raw_data["text"]).toarray()

# One column per vocabulary term, labelled with the term itself
text_tfidf = pd.DataFrame(tfidf_array, columns=tfidf.get_feature_names_out())
text_tfidf



Unnamed: 0,0,00,000,005,01,0196,02,020,026,03,...,zur,zurich,zusamen,zusammen,zverkov,zvvanzig,zvvytusend,zwingle,zygomata,zygomatic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# Features are the TF-IDF columns, the target is the author's birth year.
X_train, X_test, y_train, y_test = train_test_split(
    text_tfidf,
    raw_data.get("birth_yr"),
    random_state=1,
    test_size=0.2,
)


## SVM

In [None]:
# Support-vector model with a linear kernel, scored by RMSE on the held-out split.
# NOTE(review): svm.SVC is a classifier, so each distinct birth year is treated as an
# unordered class label while the score below is a regression metric; svm.SVR is the
# regression counterpart -- confirm which one is intended.
clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so this form runs on old and new releases alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [None]:
# Support-vector model with a polynomial kernel, scored by RMSE on the held-out split.
# NOTE(review): svm.SVC is a classifier, so each distinct birth year is treated as an
# unordered class label while the score below is a regression metric; svm.SVR is the
# regression counterpart -- confirm which one is intended.
clf = svm.SVC(kernel="poly")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so this form runs on old and new releases alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [27]:
# Support-vector model with an RBF kernel, scored by RMSE on the held-out split.
# NOTE(review): svm.SVC is a classifier, so each distinct birth year is treated as an
# unordered class label while the score below is a regression metric; svm.SVR is the
# regression counterpart -- confirm which one is intended.
clf = svm.SVC(kernel="rbf")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so this form runs on old and new releases alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

110.20692328691048

In [28]:
# Support-vector model with a sigmoid kernel, scored by RMSE on the held-out split.
# NOTE(review): svm.SVC is a classifier, so each distinct birth year is treated as an
# unordered class label while the score below is a regression metric; svm.SVR is the
# regression counterpart -- confirm which one is intended.
clf = svm.SVC(kernel="sigmoid")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so this form runs on old and new releases alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

109.46449876935301

## KNN

In [20]:
# Sweep k = 1..20 for a k-nearest-neighbours regressor and keep the k with the lowest RMSE.
# NOTE(review): k is chosen on X_test / y_test, the same split the final score is reported
# on, so the reported RMSE is optimistic; a separate validation split or cross-validation
# would give an unbiased estimate.
rmse = float("inf")  # start above any real score so the first k always becomes the incumbent
best_i = 0
for i in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # RMSE computed as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4
    # and removed in 1.6.
    var = np.sqrt(mean_squared_error(y_test, y_pred))
    # Progress line that also records the score for each k, not just the loop counter
    print(f"k={i}: RMSE={var:.3f}")
    if var < rmse:
        rmse = var
        best_i = i
print(rmse)
print(best_i)


99.5807504855189
8


In [None]:
# For each passage (row), the vocabulary term (column label) with the largest TF-IDF weight
most_unique = text_tfidf.idxmax(axis="columns")
most_unique

In [None]:
# Build a single corpus string: every passage in the text column, separated by one space
text = " ".join(raw_data.text)

# Configure the cloud first, then feed it the corpus (generate() populates the same object)
wordcloud = WordCloud(
    background_color="white",
    stopwords=stopwords,
    min_font_size=10,
)
wordcloud.generate(text)

# Render the cloud as an image with matplotlib, hiding the axes
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()