In [1]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

nltk.download("stopwords")

SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/crystal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_pickle("sentiment_lg.pkl")

In [3]:
def clean_text(text, tokenizer, stopwords):

    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [5]:
text = df.copy()
text = text[["text_clean"]]

In [6]:
text

Unnamed: 0,text_clean
0,chicago ap chicago school leaders canceled...
1,chicago ap chicago school leaders canceled...
2,chicago pd lies season episode pro...
3,wise of foolish the chicago tribune article t...
4,the neighbors near mashawn plummer s portage p...
...,...
181866,authorities say a girl who was shot in the hea...
181867,the no seed loyola chicago ramblers ...
181868,remember to check out the draftkings illinois ...
181869,buyer address valuation jennifer a ...


In [7]:
stopwords = stopwords.words("english")
text["tokens"] = text["text_clean"].map(lambda x: clean_text(x, word_tokenize, stopwords))

In [8]:
import gensim.downloader as api

In [14]:
#wv = api.load('word2vec-google-news-300')

In [70]:
wv.most_similar("resident")

[('resi_dent', 0.6230902671813965),
 ('lifelong_resident', 0.602915346622467),
 ('Resident', 0.6022466421127319),
 ('residents', 0.5787309408187866),
 ('retired_schoolteacher', 0.5495061278343201),
 ('schoolteacher', 0.50212562084198),
 ('citizen', 0.49899375438690186),
 ('councilman', 0.4956853985786438),
 ('Crumland_Farms', 0.4863513708114624),
 ('Perne', 0.48625093698501587)]

In [71]:
wv.most_similar("population")

[('populations', 0.7653926610946655),
 ('poulation', 0.6552072167396545),
 ('popula_tion', 0.6456001996994019),
 ('popu_lation', 0.5914971828460693),
 ('populace', 0.5719559788703918),
 ('pop_ulation', 0.570850670337677),
 ('inhabitants', 0.5527059435844421),
 ('populaton', 0.5474691987037659),
 ('Population', 0.5354834198951721),
 ('Demographers_predict', 0.5321643948554993)]

In [42]:
wv.most_similar("business")


[('businesses', 0.6623775362968445),
 ('busines', 0.6080313920974731),
 ('busi_ness', 0.5612965226173401),
 ('PETER_PASSI_covers', 0.5530025959014893),
 ('Business', 0.546613872051239),
 ('businesss', 0.5441080331802368),
 ('Sopris_supplemental_solutions', 0.525254487991333),
 ('company', 0.5192003846168518),
 ('entrepreneurial', 0.5077816247940063),
 ('buiness', 0.5039401650428772)]

In [15]:
docs = text["text_clean"].values
tokenized_docs = text["tokens"].values

In [43]:
def vectorize(list_of_docs):
    
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(wv.vector_size)
        vectors = []
        for token in tokens:
            if token in wv:
                try:
                    vectors.append(wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(tokenized_docs)
#len(vectorized_docs), len(vectorized_docs[0])

In [65]:
def vectorize(list_of_docs):
    
    features = []

    for tokens in list_of_docs:
        vectors = []
        for token in tokens:
            if token in wv:
                try:
                    vectors.append(wv.similarity("resident",token))
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean()
            features.append(avg_vec)
        else:
            features.append(0)
    return features
    
vectorized_docs = vectorize(tokenized_docs)

In [66]:
len(vectorized_docs)

181871

In [None]:
model_google.similarity('chicago', 'university')

In [52]:
len(wv["population"])

300

In [72]:
temp = df[["title","text","population","business","resident"]]
temp

Unnamed: 0,title,text,population,business,resident
0,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,0.094708,0.089997,0.068338
1,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,0.093367,0.090444,0.068236
2,Watch ‘Chicago P.D.’ Preview Wednesday,Chicago PD 9×11 “Lies” Season 9 Episode 11 Pro...,0.067550,0.102209,0.044861
3,Trump’s Solution for Chicago Public Schools? K...,Wise of Foolish?\nThe Chicago Tribune article ...,0.104678,0.095248,0.058324
4,"Family, friends, colleagues gathering today at...",The neighbors near Mashawn Plummer’s Portage P...,0.061630,0.076524,0.123947
...,...,...,...,...,...
181866,Chicago girl shot after celebrating 12th birth...,Authorities say a girl who was shot in the hea...,0.089800,0.061380,0.136115
181867,Loyola Chicago vs. Northern Iowa – MVC Tournam...,"The No. 4 seed Loyola Chicago Ramblers (23-7, ...",0.078802,0.049178,0.026299
181868,"DraftKings Illinois Promo Code: $1,050 Bonus a...",Remember to check out the DraftKings Illinois ...,0.038728,0.078272,0.004489
181869,"Sidney, Illinois had a median home valuation o...",[BlockShopper.com] .\n|Buyer||Address||Valuati...,0.064628,0.125551,0.071950


In [67]:

df["resident"] = vectorized_docs
df

Unnamed: 0,date,language,title,text,text_clean,cluster,topic1,w1,topic2,w2,topic3,w3,Topic_cluster,sentiment,sentiment_prob_0,sentiment_prob_1,population,business,resident
0,2022-01-06,english,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,chicago ap chicago school leaders canceled...,0,1,0.998010,,,,,7,0,1.000000,5.045681e-16,0.094708,0.089997,0.068338
1,2022-01-06,english,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,chicago ap chicago school leaders canceled...,0,1,0.998145,,,,,10,0,1.000000,4.040775e-19,0.093367,0.090444,0.068236
2,2022-01-06,english,Watch ‘Chicago P.D.’ Preview Wednesday,Chicago PD 9×11 “Lies” Season 9 Episode 11 Pro...,chicago pd lies season episode pro...,0,6,0.689289,0.0,0.214598,17.0,0.061193,4,1,0.246794,7.532063e-01,0.067550,0.102209,0.044861
3,2022-01-06,english,Trump’s Solution for Chicago Public Schools? K...,Wise of Foolish?\nThe Chicago Tribune article ...,wise of foolish the chicago tribune article t...,0,12,0.457139,1.0,0.322744,6.0,0.075293,4,0,1.000000,5.159463e-09,0.104678,0.095248,0.058324
4,2022-01-06,english,"Family, friends, colleagues gathering today at...",The neighbors near Mashawn Plummer’s Portage P...,the neighbors near mashawn plummer s portage p...,0,6,0.569758,2.0,0.336124,3.0,0.080211,1,1,0.004937,9.950635e-01,0.061630,0.076524,0.123947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181866,2022-03-05,english,Chicago girl shot after celebrating 12th birth...,Authorities say a girl who was shot in the hea...,authorities say a girl who was shot in the hea...,0,2,0.898170,0.0,0.013380,1.0,0.010981,4,0,0.814880,1.851205e-01,0.089800,0.061380,0.136115
181867,2022-03-05,english,Loyola Chicago vs. Northern Iowa – MVC Tournam...,"The No. 4 seed Loyola Chicago Ramblers (23-7, ...",the no seed loyola chicago ramblers ...,0,15,0.662747,3.0,0.177199,14.0,0.083335,2,1,0.000028,9.999719e-01,0.078802,0.049178,0.026299
181868,2022-03-05,english,"DraftKings Illinois Promo Code: $1,050 Bonus a...",Remember to check out the DraftKings Illinois ...,remember to check out the draftkings illinois ...,0,10,0.451471,16.0,0.335602,13.0,0.176749,4,0,0.414387,5.856135e-01,0.038728,0.078272,0.004489
181869,2022-03-05,english,"Sidney, Illinois had a median home valuation o...",[BlockShopper.com] .\n|Buyer||Address||Valuati...,buyer address valuation jennifer a ...,0,11,0.924808,18.0,0.033204,12.0,0.028433,8,1,0.559324,4.406761e-01,0.064628,0.125551,0.071950


In [32]:
len(vectorized_docs),len(vectorized_docs[0])

(181871, 300)

In [None]:
#df["population"] = mean_relative

In [69]:
df.to_pickle("data_with_similarity.pkl")