In [1]:
from gensim.models import Word2Vec
import networkx as nx
from wordcloud import STOPWORDS
import nltk
from nltk.probability import FreqDist
import string
import re

In [2]:
# Colab-only helpers. NOTE(review): `files` is not referenced in the visible
# cells of this notebook -- confirm whether it is still needed.
from google.colab import files
# `drive` exposes the Google Drive mount helper used below.
from google.colab import drive
# Mount Google Drive inside the Colab VM at /gdrive; the data and model paths
# used later in this notebook all live under that mount point.
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
# Location (on the mounted Drive) of the UTF-8 text file that the loading cell
# below reads in full as the corpus.
processedTxtPath = "/gdrive/MyDrive/06_ai/assets/native-seattle-processed.txt"

In [4]:

# Load the dataset. The context manager guarantees the file handle is closed
# as soon as the text has been read.
print("loading text data...")
with open(processedTxtPath, "r", encoding="utf8") as corpus_file:
    txt = corpus_file.read()

# Convert text to lowercase
txt = txt.lower()
# Remove numbers
txt = re.sub(r'\d+', '', txt)

# Remove punctuation (every character that is neither a word character nor whitespace)
txt = re.sub(r'[^\w\s]', '', txt)

# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
# https://www.journaldev.com/23763/python-remove-spaces-from-string#python-remove-whitespaces-from-string-using-regex
txt = " ".join(txt.split())
# NOTE: a former `txt.translate({...string.whitespace...})` call stood here. Its
# result was never assigned, so it had no effect; assigning it would have
# removed *all* whitespace and glued the words together, so it was dropped.

# Fold plural/possessive variants into their base form. Stripping punctuation
# above turns e.g. "seattle's" into "seattles". These are plain substring
# replacements, applied in the order written.
txt = txt.replace("gays", "gay").replace("lesbians", "lesbian").replace("seattles", "seattle").replace("citys", "city")

# Stop words = wordcloud's built-in list plus corpus-specific common words.
stopwords = set(STOPWORDS)
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated ("example" "well" -> "examplewell"), which previously
# kept both "example" and "well" out of this set.
commonwords = {"time", "one", "began", "among", "another", "see", "part", "many", "day", "way", "times",
               "still", "news", "three", "came", "became", "made", "wanted", "seemed", "now", "society",
               "ing", "first", "new", "called", "said", "come", "two", "city", "group", "state", "year",
               "case", "member", "even", "later", "month", "years", "much", "week", "county", "name", "example",
               "well", "members", "us", "say", "s"}
stopwords.update(commonwords)

# tokenize and calculate the word frequencies
nltk.download('punkt')
tokens = nltk.tokenize.word_tokenize(txt)
fDist = FreqDist(tokens)
# print(fDist.most_common(20))

# remove the stop words and common words, keeping the original counts
filtered_fDist = nltk.FreqDist({word: freq for word, freq in fDist.items() if word not in stopwords})


# print(words)
# words.remove("example")
# words.remove("told")
# words.remove("become")
# words.remove("well")
# words.remove("may")
# words.remove("june")
# words.remove("homosexuals")


loading text data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Build a directed word-similarity graph: each of the 50 most frequent
# (non-stop) words points to its 25 nearest neighbours in the Word2Vec model,
# with the similarity score stored as the edge weight. The graph is then
# exported as GEXF.
print('loading model...')
model = Word2Vec.load("/gdrive/MyDrive/06_ai/assets/native-seattle.w2v")
g = nx.DiGraph()
items = filtered_fDist.most_common(50)
for word, _count in items:
    # add_node, not add_nodes_from: the latter iterates over its argument, so
    # passing a string would add every single character as its own node.
    g.add_node(word)
    try:
        mswords = model.wv.most_similar(word, topn=25)
    except KeyError as error:
        # The word is not in the model's vocabulary: report it and move on.
        # The node itself stays in the graph, just without outgoing edges.
        print(error)
        continue
    for neighbour, similarity in mswords:
        # add_edge creates the neighbour node if it does not exist yet.
        g.add_edge(word, neighbour, weight=similarity)
        print("%s --> %s: %8.5f" % (word, neighbour, similarity))

nx.write_gexf(g, "/gdrive/MyDrive/06_ai/assets/native-seattle.gexf", encoding='utf-8', prettyprint=True, version='1.1draft')
print("finished!")

loading model...
seattle --> meanwhile:  0.99988
seattle --> historical:  0.99987
seattle --> family:  0.99987
seattle --> water:  0.99986
seattle --> lawton:  0.99986
seattle --> day:  0.99986
seattle --> fort:  0.99986
seattle --> several:  0.99986
seattle --> area:  0.99985
seattle --> march:  0.99985
seattle --> foot:  0.99984
seattle --> times:  0.99984
seattle --> harrington:  0.99984
seattle --> recalled:  0.99983
seattle --> may:  0.99983
seattle --> mary:  0.99983
seattle --> shop:  0.99983
seattle --> streets:  0.99983
seattle --> american:  0.99983
seattle --> go:  0.99983
seattle --> county:  0.99982
seattle --> tion:  0.99982
seattle --> village:  0.99982
seattle --> cedar:  0.99981
seattle --> tlingit:  0.99981
indian --> places:  0.99992
indian --> local:  0.99992
indian --> would:  0.99991
indian --> many:  0.99990
indian --> place:  0.99987
indian --> settlers:  0.99985
indian --> duwamish:  0.99982
indian --> women:  0.99981
indian --> community:  0.99981
indian --> f