In [1]:
# dependencies
import ast          # new library: abstract syntax trees!
import pandas as pd
import pymongo
import re           # new library: regex!

In [2]:
# "boring" words (plus stray single letters and '--') that are excluded
# from the word-frequency counts
lstBoring = [
    '--',
    'a', 'am', 'an', 'and', 'are', 'as', 'at',
    'b', 'be', 'but', 'by',
    'c', 'd', 'e',
    'for', 'how',
    'if', 'in', 'into', 'is', 'it', "it's", 'its',
    'just',
    'm', 'n', 'no', 'not',
    'o', 'of', 'oh', 'on', 'or', 'out',
    'r', 's', 'so',
    't', 'that', "that's", 'the', 'than', 'them', 'then', 'to', 'too',
    'w', 'was', 'where', 'while', 'who', 'with',
    'y',
]

In [3]:
# open a client against the local MongoDB server and select database "DataMate"
conn = "mongodb://localhost:27017"
cli = pymongo.MongoClient(conn)
db = cli["DataMate"]

In [4]:
# load the scraped profiles. The file holds a Python-literal dict (JSON-like),
# so it is parsed with ast.literal_eval rather than a JSON parser.
# history: path was "test_output.txt", is now "pof_output.txt";
#          encoding was "utf-8", is now "cp1252"
strPath = "../scraping/pof_output.txt"
with open(strPath, mode='r', encoding='cp1252') as file:
    strFile = file.read()

# parse once the file handle has been closed
dictProfile = ast.literal_eval(strFile)

In [5]:
# store the whole profiles dict as a single document in collection "Profile"
# of the DataMate database.
# NOTE(review): pymongo is documented to add an '_id' key to the inserted dict
# in place -- presumably why MakeWordFreqDf() skips that key; confirm.
db["Profile"].insert_many([dictProfile])

<pymongo.results.InsertManyResult at 0x1db8462fc48>

In [6]:
# MakeWordFreqDf() builds a word-frequency dataframe for one profile attribute
# and saves it to "<strAttr>.csv".
# The source dict looks like this:
#   {'match_0':{'username':'bob', 'hair':'blond', 'attr':'etc'},
#    'match_1':{'username':'sam', 'hair':'brown', 'attr':'etc'}}
def MakeWordFreqDf(dictSource, strAttr, fBreakChunks = False):
    """Count how often each word/chunk occurs in one attribute across profiles.

    Parameters:
        dictSource:   dict of profiles keyed by match id; a top-level '_id'
                      entry is skipped.
        strAttr:      attribute to read from every profile. Its value is
                      iterated chunk by chunk (presumably a list of strings --
                      confirm against the scraper output).
        fBreakChunks: forwarded to CountWords(); True splits each chunk into
                      words, False counts each chunk as a whole.

    Returns:
        DataFrame indexed by word with a single 'count' column, sorted by
        descending count, with the words in lstBoring removed.

    Side effect: writes the result to strAttr + '.csv' (relative path).
    """
    dictCounts = {}
    for strKey in dictSource:
        if strKey == '_id':
            continue  # not a profile entry
        for strChunk in dictSource[strKey][strAttr]:
            dictCounts = CountWords(strChunk.lower(), dictCounts, fBreakChunks)

    dfSorted = (
        pd.DataFrame
        .from_dict(dictCounts, orient='index', columns=['count'])
        .sort_values(['count'], ascending=False)
    )
    dfResult = DropBoringWords(dfSorted, lstBoring)
    dfResult.to_csv(strAttr + '.csv', index = True, header = True)
    return dfResult

In [7]:
# CountWords() tallies the words of one chunk into a running count dict;
# optionally splits a multi-word chunk (eg, "hi...let's") into words (eg, "hi" and "let's")
def CountWords(strChunk, dict, fBreakChunks = False):
    """Add the word(s) found in `strChunk` to the running tally `dict`.

    Parameters:
        strChunk:     text to count.
        dict:         word -> count mapping; updated in place.
        fBreakChunks: if True, split `strChunk` on every run of characters
                      other than letters, digits, apostrophes and hyphens;
                      if False, count the whole chunk as a single word.

    Returns:
        The same `dict` object that was passed in, after updating.
    """
    lstWords = re.split(r"[^a-zA-Z0-9'\-]+", strChunk) if fBreakChunks else [strChunk]
    for strWord in lstWords:
        if len(strWord) == 0:
            continue  # splitting leaves empty strings at the edges
        dict[strWord] = dict.get(strWord, 0) + 1
    return dict

In [8]:
# DropBoringWords() drops list of boring words from dataframe
def DropBoringWords(df, lst):
    """Return `df` without the rows whose index label is one of the words in `lst`.

    Parameters:
        df:  DataFrame indexed by word.
        lst: iterable of words (index labels) to remove.

    Returns:
        A new DataFrame with the matching rows dropped; the caller's frame is
        not modified.

    Prints one line saying which words from `lst` were not present in the
    index (or that all of them were present).

    Words are looked up in the index explicitly rather than by catching every
    exception from `drop`, so an unrelated failure (for example `df` not being
    a DataFrame) surfaces instead of being reported as a missing word.
    """
    # dict.fromkeys de-duplicates while preserving order
    lstPresent = [strWord for strWord in dict.fromkeys(lst) if strWord in df.index]
    lstLack = [strWord for strWord in lst if strWord not in df.index]

    # one drop for all labels instead of copying the frame once per word
    df = df.drop(index=lstPresent)

    if not lstLack:
        print('Dataframe had all the boring words. No longer.')
    else:
        print(f"Dataframe lacked the boring words {', '.join(lstLack)}.")
    return df

In [9]:
# word frequencies for the parsed "About Me" field; each chunk is split into words
dfAboutMe = MakeWordFreqDf(dictProfile, 'about_me_split', fBreakChunks=True)

Dataframe had all the boring words. No longer.


In [10]:
# frequencies for the "Interests" field; each entry is counted whole, not split into words
dfInterest = MakeWordFreqDf(dictProfile, 'interests', fBreakChunks=False)

Dataframe lacked the boring words --, a, am, an, and, are, as, at, b, be, but, by, c, d, for, how, if, in, into, is, it, it's, its, just, m, no, not, of, oh, on, or, out, r, s, so, t, that, that's, the, than, them, then, to, too, w, was, where, while, who, with, y.
