In [1]:
# Import Dependencies
import ast          # new library: abstract syntax trees!
import pandas as pd
import pymongo
import re           # new library: regex!
import string


In [2]:
# Load the scraped female profiles. The file is read as Windows-style "cp1252" text.
# Its content is a Python literal rather than JSON: the first literal_eval yields a
# bytes object, which is decoded as UTF-8 and evaluated again into the profile dict.
strPath = "Resources/pof_output_female.txt"
with open(strPath, mode='r', encoding='cp1252') as fhProfilesf:
    strFilef = fhProfilesf.read()
rawBytesf = ast.literal_eval(strFilef)
dictProfilef = ast.literal_eval(rawBytesf.decode("utf-8"))

In [3]:
# AddFieldsFromDictCol() converts one dictionary-style column into multiple discrete columns
def AddFieldsFromDictCol(df, strDictCol):
    """Expand a column of dictionaries into one column per dictionary key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    strDictCol : str
        Name of the column whose cells hold dictionaries.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. For every key found in the dictionaries a column
        named after the whitespace-stripped key is added, unless a column with
        that name already exists (existing columns are never overwritten). The
        source column ``strDictCol`` itself is left in place.
    """
    def _expand(cell):
        # Cells that are not dicts (e.g. NaN for a missing entry) expand to an empty
        # Series so they contribute no columns. pd.Series(NaN) would instead produce
        # an integer column label 0, which has no .strip().
        if isinstance(cell, dict):
            return pd.Series(cell)
        return pd.Series(dtype=object)

    dfTemp = df[strDictCol].apply(_expand)
    if not isinstance(dfTemp, pd.DataFrame):
        # An empty input column makes .apply return an empty Series: nothing to add.
        return df

    for strCol in dfTemp.columns:
        # Only string keys can be stripped; any other label is used as-is.
        strName = strCol.strip() if isinstance(strCol, str) else strCol
        if strName not in df.columns:
            df[strName] = dfTemp[strCol]
    return df

In [4]:
# Build the female frame: one row per profile key, columns in the scraped field order
lstColf = [
    'username',
    'headline',
    'profile_info_1',
    'profile_info_2',
    'interests',
    'about_me_text',
]
dfAllf = pd.DataFrame.from_dict(dictProfilef, orient='index', columns=lstColf)

# Flatten each dictionary-style column into discrete columns, then discard the original
for strColf in ['profile_info_1', 'profile_info_2']:
    dfAllf = AddFieldsFromDictCol(dfAllf, strColf).drop(columns=strColf)

# Drop every record lacking the basic Details entry
# (eg: 66 year old Male, 5' 10" (178cm), Christian - other)
dfAllf = dfAllf.dropna(subset=['Details'], thresh=1)

# Tidy whitespace; a missing 'Pets' value is recorded as 'No Pets'
dfAllf['Hair Color'] = dfAllf['Hair Color'].str.strip()
dfAllf['Pets'] = dfAllf['Pets'].fillna('No Pets').str.strip()

In [5]:
# Show the flattened female-profile frame as this cell's output
dfAllf

Unnamed: 0,username,headline,interests,about_me_text,About,City,Details,Ethnicity,Intent,Education,...,AboutCindie,AboutJoy,AboutJae,AboutMonie,AboutSue,AboutTerry,AboutNiqq,AboutLori,Aboutveronica,AboutKenda
match_0,ShortyFire1984,"looking for honest men, no games!",,..............................,Non-Smoker with Average body type,"Austin, Texas","34 year old Female, 5' 0"" (152cm), Christian -...","Caucasian, Sagittarius",ShortyFire1984 is looking for a relationship.,Bachelors Degree,...,,,,,,,,,,
match_1,ROXYCHIC549,Laugh Often. Live Longer... I'm Not An Upgrade...,,"'"" I am not an upgraded member. So if you want...",Non-Smoker with Average body type,"Austin, Texas","62 year old Female, 5' 3"" (160cm), Christian -...","Caucasian, Sagittarius",ROXYCHIC549 is looking for a relationship.,Bachelors Degree,...,,,,,,,,,,
match_10,tequilagirl69,Let's live life and....,,Laid back lovable fun leo looking for someone ...,Non-Smoker with A Few Extra Pounds body type,"Round rock, Texas","50 year old Female, 5' 2"" (157cm), Christian -...","Caucasian, Leo",tequilagirl69 is actively seeking a relationship.,Associates Degree,...,,,,,,,,,,
match_100,atxredneckgirl,Foxy...read profile 1st,"[LAKE, Music, Dance]",Tx country music junkie.No sickos w nasty pics...,Non-Smoker with Prefer Not To Say body type,"Austin, Texas","39 year old Female, 5' 4"" (163cm), Non-religious","Caucasian, Capricorn",atxredneckgirl is looking for a relationship.,Some college,...,,,,,,,,,,
match_101,mel12th,Who wants to chat,,I am here to look for people to converse with....,Non-Smoker with Big & Tall/BBW body type,"San antonio, Texas","34 year old Female, 5' 6"" (168cm), Non-religious","Hispanic, Virgo",mel12th wants to date but nothing serious.,Bachelors Degree,...,,,,,,,,,,
match_102,Andi010274,I'm so over this...,,Can't get into my account so I had to start over.,Non-Smoker with A Few Extra Pounds body type,"Bastrop, Texas","45 year old Female, 5' 2"" (157cm), Catholic","Caucasian, Capricorn",Andi010274 is looking for a relationship.,Some college,...,,,,,,,,,,
match_103,nodutch,.........shhhhh.ok ok I'm on here ...,,Music business goals meditation affirmation a ...,Non-Smoker with Athletic body type,"Austin, Texas","45 year old Female, 5' 7"" (170cm), Christian -...","Black, Taurus",nodutch is looking for a relationship.,Associates Degree,...,,,,,,,,,,
match_104,lauranmr,Did I catch your eye? Don’t be shy ;),"[Music, Arts, Animals]",Of all the dating profiles in all the world an...,Non-Smoker with Big & Tall/BBW body type,"Dale, Texas","50 year old Female, 5' 6"" (168cm), Non-religious","Caucasian, Taurus",lauranmr is looking for a relationship.,Some college,...,,,,,,,,,,
match_105,smbd75,"FWB need not apply, ty next...",,"Likes: fun, adventure, exciting.Dislikes: bori...",Non-Smoker with A Few Extra Pounds body type,"Austin, Texas","45 year old Female, 5' 6"" (168cm), Other","Other Ethnicity, Aquarius",smbd75 wants to date but nothing serious.,Bachelors Degree,...,,,,,,,,,,
match_106,candipie1989,Treat me like you want to be treated,,"Single black female, brown eyes and black hair...",Non-Smoker with Big & Tall/BBW body type,"Austin, Texas","48 year old Female, 5' 4"" (163cm), Baptist","Black, Virgo",candipie1989 is actively seeking a relationship.,Associates Degree,...,,,,,,,,,,


In [6]:
# build new fields from existing data
# 'Details' reads like: 34 year old Female, 5' 0" (152cm), Christian - other
# Age and the gender initial are captured by pattern rather than by fixed character
# positions, so an age that is not exactly two digits no longer shifts both values.
dfDetailsf = dfAllf['Details'].str.extract(r'^\s*(\d+)\s+year old\s+(\w)')
dfAllf['Age'] = dfDetailsf[0].astype('int')
dfAllf['Gender'] = dfDetailsf[1]

# 'Ethnicity' arrives as "<ethnicity>, <zodiac sign>"; split once and reuse both parts.
# NOTE: 'Ethnicity' is overwritten below, so re-running this cell on the already
# reduced column sets 'Zodiac Sign' to NaN.
serPartsf = dfAllf['Ethnicity'].str.split(', ')
dfAllf['Zodiac Sign'] = serPartsf.str[1]
dfAllf['Ethnicity'] = serPartsf.str[0]

In [7]:
# remove special characters from 'about me text'
# Translation table that deletes every character in string.punctuation. It is built
# once instead of constructing a frozenset for each character of each string.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(s):
    """Return ``s`` with every ``string.punctuation`` character removed.

    Parameters
    ----------
    s : str
        Text to clean. A value that is not a string (for example ``None`` or the
        float NaN pandas uses for missing text) is returned unchanged instead of
        raising.

    Returns
    -------
    str
        The cleaned text, or the original value when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return s
    return s.translate(_PUNCTUATION_TABLE)

# Add a punctuation-free copy of each female profile's free-text description
serAboutf = dfAllf['about_me_text']
dfAllf['about_me_text_clean'] = serAboutf.apply(remove_punctuation)

In [8]:
# Write the full female frame (index included) to CSV in the working directory
dfAllf.to_csv(path_or_buf='allFemale.csv')

In [9]:
# Load the scraped male profiles. The file is read as Windows-style "cp1252" text.
# Its content is a Python literal rather than JSON: the first literal_eval yields a
# bytes object, which is decoded as UTF-8 and evaluated again into the profile dict.
strPath = "Resources/pof_output_male.txt"
with open(strPath, mode='r', encoding='cp1252') as fhProfilesm:
    strFilem = fhProfilesm.read()
rawBytesm = ast.literal_eval(strFilem)
dictProfilem = ast.literal_eval(rawBytesm.decode("utf-8"))

In [10]:
# AddFieldsFromDictCol() converts one dictionary-style column into multiple discrete columns
# NOTE: this re-declares the helper already defined earlier in the notebook; the
# definition in this cell is the one in effect for the cells that follow.
def AddFieldsFromDictCol(df, strDictCol):
    """Expand a column of dictionaries into one column per dictionary key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    strDictCol : str
        Name of the column whose cells hold dictionaries.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. For every key found in the dictionaries a column
        named after the whitespace-stripped key is added, unless a column with
        that name already exists (existing columns are never overwritten). The
        source column ``strDictCol`` itself is left in place.
    """
    def _expand(cell):
        # Cells that are not dicts (e.g. NaN for a missing entry) expand to an empty
        # Series so they contribute no columns. pd.Series(NaN) would instead produce
        # an integer column label 0, which has no .strip().
        if isinstance(cell, dict):
            return pd.Series(cell)
        return pd.Series(dtype=object)

    dfTemp = df[strDictCol].apply(_expand)
    if not isinstance(dfTemp, pd.DataFrame):
        # An empty input column makes .apply return an empty Series: nothing to add.
        return df

    for strCol in dfTemp.columns:
        # Only string keys can be stripped; any other label is used as-is.
        strName = strCol.strip() if isinstance(strCol, str) else strCol
        if strName not in df.columns:
            df[strName] = dfTemp[strCol]
    return df

In [11]:
# Build the male frame: one row per profile key, columns in the scraped field order
lstColm = [
    'username',
    'headline',
    'profile_info_1',
    'profile_info_2',
    'interests',
    'about_me_text',
]
dfAllm = pd.DataFrame.from_dict(dictProfilem, orient='index', columns=lstColm)

# Flatten each dictionary-style column into discrete columns, then discard the original
for strColm in ['profile_info_1', 'profile_info_2']:
    dfAllm = AddFieldsFromDictCol(dfAllm, strColm).drop(columns=strColm)

# Drop every record lacking the basic Details entry
# (eg: 66 year old Male, 5' 10" (178cm), Christian - other)
dfAllm = dfAllm.dropna(subset=['Details'], thresh=1)

# Tidy whitespace; a missing 'Pets' value is recorded as 'No Pets'
dfAllm['Hair Color'] = dfAllm['Hair Color'].str.strip()
dfAllm['Pets'] = dfAllm['Pets'].fillna('No Pets').str.strip()

In [12]:
# build new fields from existing data
# 'Details' reads like: 66 year old Male, 5' 10" (178cm), Christian - other
# Age and the gender initial are captured by pattern rather than by fixed character
# positions, so an age that is not exactly two digits no longer shifts both values.
dfDetailsm = dfAllm['Details'].str.extract(r'^\s*(\d+)\s+year old\s+(\w)')
dfAllm['Age'] = dfDetailsm[0].astype('int')
dfAllm['Gender'] = dfDetailsm[1]

# 'Ethnicity' arrives as "<ethnicity>, <zodiac sign>"; split once and reuse both parts.
# NOTE: 'Ethnicity' is overwritten below, so re-running this cell on the already
# reduced column sets 'Zodiac Sign' to NaN.
serPartsm = dfAllm['Ethnicity'].str.split(', ')
dfAllm['Zodiac Sign'] = serPartsm.str[1]
dfAllm['Ethnicity'] = serPartsm.str[0]

In [13]:
# Add a punctuation-free copy of each male profile's free-text description
serAboutm = dfAllm['about_me_text']
dfAllm['about_me_text_clean'] = serAboutm.apply(remove_punctuation)

In [14]:
# Write the full male frame (index included) to CSV in the working directory
dfAllm.to_csv(path_or_buf='allMale.csv')

In [15]:
# concatenate male and female dataframe
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat with the
# same ignore_index/sort arguments is the supported equivalent. Rows get a fresh
# 0..n-1 index and the columns are the sorted union of both frames' columns.
dfAll = pd.concat([dfAllf, dfAllm], ignore_index=True, sort=True)

In [16]:
# Keep just the columns needed downstream, in this order
lstKeep = ['username', 'Gender', 'Education', 'Ethnicity', 'about_me_text_clean']
dfAll = dfAll.loc[:, lstKeep]

In [17]:
# Write the combined male + female frame (index included) to CSV in the working directory
dfAll.to_csv(path_or_buf='all.csv')