In [1]:
# dependencies
import ast          # new library: abstract syntax trees!
import numpy as np
import pandas as pd
import pymongo
import re           # new library: regex!

In [2]:
# words too common to be interesting; they are excluded from word frequency counts
# (order matters only for the "lacked the boring words ..." message)
lstBoring = [
    "--", "a", "am", "an", "and", "are", "as", "at", "b", "be", "but", "by",
    "c", "d", "e", "for", "how", "if", "in", "into", "is", "it", "it's", "its",
    "just", "m", "n", "no", "not", "o", "of", "oh", "on", "or", "out", "r",
    "s", "so", "t", "that", "that's", "the", "than", "them", "then", "to",
    "too", "w", "was", "where", "while", "who", "with", "y",
]

In [3]:
# open a client against the local mongo server and select database "DataMate"
conn = 'mongodb://localhost:27017'
cli = pymongo.MongoClient(conn)
db = cli['DataMate']

In [4]:
# load the scraped profiles. source file was "test_output.txt", is now "pof_output.txt";
# encoding was "utf-8", is now "cp1252"
strPath = "../scraping/pof_output.txt"
with open(strPath, mode = 'r', encoding = 'cp1252') as file:
    strFile = file.read()
# the text is parsed as a Python literal (ast.literal_eval), not with a JSON parser
dictProfile = ast.literal_eval(strFile)

In [5]:
# insert every profile document into DataMate db collection "Profile"
# NOTE(review): nothing here clears or de-duplicates the collection, so a fresh
# run of the notebook adds the same profiles again -- confirm that is intended
for strKey, dictVal in dictProfile.items():
    if strKey == '_id':
        continue  # a top-level '_id' entry is not a profile
    db.Profile.insert_one(dictVal)

In [6]:
# MakeWordFreqDf() totals word frequency across all profiles for one list field
def MakeWordFreqDf(dictSource, strListName, fBreakChunks = False):
    """Build, save and return a word-frequency dataframe for one list field.

    dictSource maps a key to a profile dict; each profile holds a list under
    strListName (eg, "interests"):
        {'match_0':{'username':'bob', 'hair':'blond', 'interests':['etc1', 'etc2']},
         'match_1':{'username':'sam', 'hair':'brown', 'interests':['etc3', 'etc4']}}
    A top-level '_id' key is skipped.

    Every list entry is lower-cased and counted with CountWords(); fBreakChunks
    is passed straight through to it. The counts become a one-column ("count")
    dataframe indexed by word, sorted descending, with the words in lstBoring
    removed by DropBoringWords().

    Side effects: writes "<strListName>.csv" and prints a confirmation line.
    Returns the dataframe.
    """
    # tally words over every profile except the '_id' entry
    dictCounts = {}
    lstProfiles = [dictRow for strId, dictRow in dictSource.items() if strId != '_id']
    for dictRow in lstProfiles:
        for strChunk in dictRow[strListName]:
            dictCounts = CountWords(strChunk.lower(), dictCounts, fBreakChunks)
    # counts -> dataframe, most frequent first
    dfFreq = pd.DataFrame.from_dict(
        dictCounts, orient='index', columns=['count']
    ).sort_values(['count'], ascending=False)
    # remove the boring words, then persist
    dfFreq = DropBoringWords(dfFreq, lstBoring, strListName)
    strCsv = strListName + '.csv'
    dfFreq.to_csv(strCsv, index = True, header = True)
    print(f'• File "{strListName}.csv" saved')
    return dfFreq

In [7]:
# CountWords() adds the word(s) of one text chunk to a running tally
def CountWords(strChunk, dict, fBreakChunks = False):
    """Increment the count of each word found in strChunk and return the tally.

    strChunk     -- text to count.
    dict         -- running {word: count} tally; it is updated in place and also
                    returned. (The name shadows the builtin inside this function;
                    it is kept because it is part of the signature.)
    fBreakChunks -- when true, a multi-word chunk (eg, "hi...let's") is broken
                    into words (eg, "hi" and "let's") by splitting on every run
                    of characters other than ASCII letters, digits, apostrophes
                    and hyphens. When false, the whole chunk counts as one word.

    Empty strings are never counted.
    """
    lstWords = re.split(r"[^a-zA-Z0-9'\-]+", strChunk) if fBreakChunks else [strChunk]
    for strWord in lstWords:
        if len(strWord) == 0:
            continue  # the split leaves empty pieces at the edges
        # first sighting starts at 0, so this covers both new and known words
        dict[strWord] = dict.get(strWord, 0) + 1
    return dict

In [8]:
# DropBoringWords() drops list of boring words from dataframe
def DropBoringWords(df, lst, strName):
    """Return a copy of df without the rows whose index label is in lst.

    df      -- dataframe indexed by word.
    lst     -- words to remove.
    strName -- name used for df in the printed report.

    Prints one line: either that every word in lst was present, or the
    comma-separated words of lst that df did not contain. df itself is not
    modified.
    """
    lst = list(lst)
    # work out what is missing by looking at the index, rather than by catching
    # whatever exception a failed drop happens to raise
    lstLack = [strWord for strWord in lst if strWord not in df.index]
    # one drop for all words; labels that are absent are simply skipped
    df = df.drop(index=lst, errors='ignore')
    if not lstLack:
        print(f'• Dataframe "{strName}" had all the boring words. No longer.')
    else:
        strLack = ', '.join(lstLack)
        print(f'• Dataframe "{strName}" lacked the boring words {strLack}.')
    return df

In [9]:
# build the word-frequency dataframes: "about_me_split" chunks are broken into
# individual words, "interests" entries are each counted whole
dfAboutMeSplit = MakeWordFreqDf(dictProfile, 'about_me_split', fBreakChunks = True)
dfInterests = MakeWordFreqDf(dictProfile, 'interests', fBreakChunks = False)

• Dataframe "about_me_split" had all the boring words. No longer.
• File "about_me_split.csv" saved
• Dataframe "interests" lacked the boring words --, a, am, an, and, are, as, at, b, be, but, by, c, d, for, how, if, in, into, is, it, it's, its, just, m, no, not, of, oh, on, or, out, r, s, so, t, that, that's, the, than, them, then, to, too, w, was, where, while, who, with, y.
• File "interests.csv" saved


In [10]:
# query Mongo grouping by CHEP (city, hair, eyes, pets): one result per distinct
# combination, with "count" = number of Profile documents in that group
dictGrpCHEP = {
    "$group": {
        "_id": {
            "city": "$profile_info_1.City",
            "hair": "$profile_info_2.Hair Color",
            "eyes": "$profile_info_2.Eye Color",
            "pets": "$profile_info_2.Pets",
        },
        "count": {"$sum": 1},
    }
}
cursor = db.Profile.aggregate([dictGrpCHEP])

In [11]:
# populate dfCHEP from Mongo results: one row per group key ("_id") returned
# by the aggregate query
# NOTE: iterating consumes the cursor, so re-running this cell without first
# re-running the query cell produces an empty dataframe
lst = [row["_id"] for row in cursor] # https://stackoverflow.com/a/17496530/8422614
# columns= guarantees that all four columns exist (NaN-filled where a group key
# lacks a field) even when the query returns nothing
dfCHEP = pd.DataFrame(lst, columns=['city', 'hair', 'eyes', 'pets'])

In [12]:
# create dfSHEP, replacing city field with state field
# work on a copy: plain assignment would alias dfCHEP and add 'state' to it too
dfSHEP = dfCHEP.copy()
# drop everything up to and including the last ", " (a value of the form
# "<city>, <state>" becomes "<state>"). regex=True must be explicit: pandas 2.0+
# treats the pattern as a literal string by default
dfSHEP['state'] = dfSHEP['city'].str.replace('.*, ', '', regex=True)
dfSHEP = dfSHEP.drop('city', axis=1)
dfSHEP = dfSHEP.fillna(value = {'pets': 'No Pets'})
# every row is weighted 1; the per-group "count" computed by the Mongo $group
# stage is not carried into dfCHEP -- NOTE(review): confirm this is intended
dfSHEP['count'] = 1

In [13]:
# create group & dfSumm: S (total "count" per state)
grpS = dfSHEP.groupby(['state'])
# sum only the numeric "count" column: pandas 2.0+ no longer silently drops
# the text columns (hair, eyes, pets) from a groupby sum
dfSummS = grpS[['count']].sum().reset_index()

In [14]:
# MakeSumm() makes a Summary table that shows percentages across the state
def MakeSumm(dfIn, lstFld, dfState = None):
    """Total "count" per combination of lstFld and express it as a share of the state.

    dfIn    -- dataframe with a numeric "count" column and every column named
               in lstFld.
    lstFld  -- list of column names to group by; it must include 'state',
               which is the merge key for the state totals.
    dfState -- optional dataframe with "state" and "count" columns holding the
               per-state totals. When omitted, the notebook-level dfSummS is
               used, as before.

    Returns a dataframe with the columns of lstFld followed by "rowcount"
    (total for the combination), "statecount" (total for the state) and
    "pct" (rowcount / statecount).

    Side effects: writes "Summ_<fields joined by _>.csv" and prints a
    confirmation line.
    """
    if dfState is None:
        dfState = dfSummS
    # sum only "count": pandas 2.0+ no longer drops text columns from a sum
    dfRows = dfIn.groupby(lstFld)[['count']].sum().reset_index()
    dfRows = dfRows.rename(columns={'count': 'rowcount'})
    # keep only the key and the total, so stray columns in the state table
    # cannot collide with the grouped fields during the merge
    dfTotals = dfState[['state', 'count']].rename(columns={'count': 'statecount'})
    dfOut = pd.merge(dfRows, dfTotals, on='state')
    dfOut['pct'] = dfOut['rowcount'] / dfOut['statecount']
    strFile = 'Summ_' + '_'.join(lstFld) + '.csv'
    dfOut.to_csv(strFile, index = True, header = True)
    print(f'• {strFile} saved.')
    return dfOut

In [15]:
# create the summary dataframes, in this order:
#   S+H (hair), S+E (eyes), S+P (pets), S+H+E+P (all three)
lstSummFlds = [
    ['state', 'hair'],
    ['state', 'eyes'],
    ['state', 'pets'],
    ['state', 'hair', 'eyes', 'pets'],
]
dfSummSH, dfSummSE, dfSummSP, dfSummSHEP = [
    MakeSumm(dfSHEP, lstFld) for lstFld in lstSummFlds
]

• Summ_state_hair.csv saved.
• Summ_state_eyes.csv saved.
• Summ_state_pets.csv saved.
• Summ_state_hair_eyes_pets.csv saved.
