### build 2D CNN (modified pattern from https://keras.io/examples/cifar10_cnn)

In [1]:
# import general dependencies
import ast          # new library: abstract syntax trees!
from matplotlib import pyplot
import numpy as np
import os
import pandas as pd

In [2]:
# import ML dependencies
from __future__ import print_function
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from tensorflow.keras.preprocessing import image
import lib.MakeCIFAR10StyleModel

Using TensorFlow backend.


### define parameters

In [3]:
# set params
strPath = '.'
strModel = 'CIFAR10Style_doctordata_MyCity.h5'

### import pictures as dataset for independent (X) vars

In [4]:
# create X data set
strImgPath = '../ek_scrape/img/doctordata/Online'
lstUsername = []
lstImg = []
for root, dirs, lstFile in os.walk(strImgPath):
    lstFile.sort()
    for strFile in lstFile:
        if strFile[-6:] == '00.png' and strFile != 'TraumaInTheUSA.00.png': # get first pic, and skip 1 bad rec
            print(strFile)
            lstUsername.append(strFile[:-7])
            img = pyplot.imread(strImgPath + '/' + strFile, format='jpg')
            arrImg = image.img_to_array(img)
            lstImg.append(arrImg)
arrX = np.array(lstImg)
arrX.shape

13D.00.png
1KindOfLove.00.png
1PrettyFeetChef1.00.png
1STREAD.00.png
1cyndi.00.png
210Frances.00.png
210shine.00.png
22aries74.00.png
2705alicia.00.png
2k18blessings.00.png
47Strings.00.png
4blonde4.00.png
512martinez.00.png
51Izzy.00.png
5Dallamakeyahalla.00.png
71NOexpectations.00.png
76Bluesky39.00.png
80ma.00.png
81vegasgirl.00.png
8bellaleah8.00.png
96jennifer.00.png
AEW1977.00.png
AGirlLooking.00.png
ATXgirl322.00.png
Adianne70.00.png
AkilaUno.00.png
Alejandra3.00.png
AlluringEJ.00.png
Amycast1.00.png
Andhereweareagain.00.png
Andi010274.00.png
Andreag11.00.png
Angel601.00.png
AngelbabyinSA.00.png
Annapblp.00.png
Armcandy13.00.png
AshTX.00.png
Aubie019.00.png
Austin6702.00.png
AydnilSantiago.00.png
Az18714.00.png
BBW43.00.png
Babe62.00.png
BabyGirlSA.00.png
BeHappy58.00.png
BeachSick.00.png
Bebepong29.00.png
Becca5013.00.png
BelieveInDestiny2009.00.png
Believedj.00.png
BellaHeart7777.00.png
Bellablonde1.00.png
Bindykins.00.png
Blkgrlnxtdr.00.png
BonnBonn1970.00.png
BossLadyCyn.00.

pamhenderson027.00.png
persianna.00.png
pinkintx.00.png
pinkstar813.00.png
pinky3850.00.png
pkutex06.00.png
plussizedsistah.00.png
pniqueen.00.png
pomk85.00.png
prcanno.00.png
prettygirl3085.00.png
prettylilthing512.00.png
princessandria.00.png
princesscharming.00.png
rachell1120.00.png
reddrover48.00.png
redinsa.00.png
rmartinez62.00.png
roadneed101.00.png
rockee74.00.png
rose197402.00.png
rubys1981.00.png
sabelle30.00.png
sammy716.00.png
sandrasatx.00.png
satxsaa26.00.png
scubakat7.00.png
sever1786.00.png
sexualred2.00.png
sexyme41.00.png
shadesofredd.00.png
shannadcc.00.png
sherrimiller228.00.png
shesliketexas.00.png
skylight84.00.png
sl2781.00.png
smbd75.00.png
smiley4uu4me.00.png
smiley7919.00.png
smmcou84.00.png
southaustinnicegirl1.00.png
southstar5.00.png
splatina.00.png
srort05.00.png
stefdref.00.png
stephgjmt.00.png
stillwaiting4unow.00.png
sugarg73.00.png
sungirltx.00.png
sunniatx.00.png
sunshinebae.00.png
susank2084.00.png
susierod56.00.png
sweetflower15.00.png
sweetie7290.

(572, 110, 110, 3)

In [5]:
# change lstUsername into dataframe--will be used to filter profiles later 
dfUsername = pd.DataFrame({'username':lstUsername})

### read Profile data

In [6]:
# read JSON from file. "pof_output.txt" encoding is Windows-style "cp1252"
strPath = "../ek_scrape/img/doctordata/Online/pof_output.txt"
with open(strPath, 'r', encoding = 'cp1252') as file:
    strFile = file.read()
    dictProfile = ast.literal_eval(ast.literal_eval(strFile).decode("utf-8"))

In [7]:
# AddFieldsFromDictCol() converts one dictionary-style column into multiple discrete columns
def AddFieldsFromDictCol(df, strDictCol):
    dfTemp = df[strDictCol].apply(pd.Series) # .apply(pd.Series) is my favorite method EVER
    for strCol in dfTemp.columns:
        if strCol.strip() not in df.columns:
            df[strCol.strip()] = dfTemp[strCol]
    return df

In [8]:
# dump all fields to dataframe
lstCol = ['username', 'headline', 'profile_info_1', 'profile_info_2', 'interests', 'about_me_text']
dfAll = pd.DataFrame.from_dict(dictProfile, orient = 'index', columns = lstCol)

In [None]:
# convert dictionary-style columns to discrete columns and drop the dictionary columns
# this causes some columns to be mis-labelled--that'll be fixed below
for strCol in ['profile_info_1', 'profile_info_2']:
    dfAll = AddFieldsFromDictCol(dfAll, strCol)
    dfAll = dfAll.drop(strCol, axis=1)

### reformat Profile data. Categorical cols will be tested as dependent (Y) vars

In [None]:
# drop all the "About" columns, records with NaN in "Details"
dfAll = dfAll[dfAll.columns.drop(list(dfAll.filter(regex='About.*')))]
dfAll = dfAll.dropna(subset=['Details'])

In [None]:
# build new fields from existing data
dfAll['Age'] = dfAll['Details'].str.strip().str[0:2].astype('int')
dfAll['Gender'] = dfAll['Details'].str.strip().str[12:13]
dfAll['Zodiac Sign'] = dfAll['Ethnicity'].str.split(', ').str[1]
dfAll['Ethnicity'] = dfAll['Ethnicity'].str.split(', ').str[0]
dfAll['Intent'] = dfAll['Intent'].str.replace('^\S+\s?', '')

In [None]:
# strip whitespace from text columns
for strCol in dfAll.columns:
    if dfAll[strCol].dtype == 'object':
        dfAll[strCol] = dfAll[strCol].str.strip()

In [None]:
# drop unneeded columns (some col labels are still shifted)
dfAll = dfAll[dfAll.columns.drop(['headline', 'City', 'interests', 'about_me_text',
                                  'Details', 'Profession', 'For', 'Chemistry',
                                  'Do you drink?', 'Hair Color', 'Gender', 'Interests'])]

In [None]:
# fix shifted column names
dfAll = dfAll.rename(columns={'Needs Test':'For', 'Do you want children?':'Do you drink?', 
                              'Marital Status':'Do you want children?', 'Do you do drugs?':'Marital Status', 
                              'Eye Color':'Hair Color', 'Do you have a car?':'Eye Color', 
                              'Do you have children?':'Do you have a car?', 
                              'Longest Relationship':'Do you have children?', 
                              'How ambitious are you?':'Longest Relationship', 
                              'Pets':'How ambitious are you?', 'Second Language':'Pets'})

In [None]:
# merge with dfUsername, and sort
dfFinal = pd.merge(dfAll, dfUsername, on='username')
dfFinal = dfFinal.sort_values(['username'])

In [None]:
dfFinal

Unnamed: 0,username,Ethnicity,Intent,Education,Personality,For,Do you drink?,Do you want children?,Marital Status,Hair Color,Eye Color,Do you have a car?,Do you have children?,Longest Relationship,How ambitious are you?,Pets,Age,Zodiac Sign
171,13D,Black,is actively seeking a relationship.,Some college,Free Thinker,Long Term,Socially,Does not want children,Single,Red,Brown,Yes,All my kids are over 18,Over 7 years,Ambitious,,50,Cancer
158,1KindOfLove,Caucasian,is actively seeking a relationship.,Some University,Animal Lover,Long Term,Socially,Does not want children,Single,Blond(e),Green,Yes,No,Over 7 years,Ambitious,,50,Capricorn
239,1PrettyFeetChef1,Caucasian,is actively seeking a relationship.,Bachelors Degree,Free Thinker,Long Term,Socially,Does not want children,Divorced,Blond(e),Green,Yes,All my kids are over 18,Over 10 years,Very Ambitious,,52,Capricorn
121,1STREAD,Black,is actively seeking a relationship.,Bachelors Degree,Athletic,Dating,Socially,Does not want children,Single,Mixed Color,Brown,Yes,All my kids are over 18,Over 4 years,Very Ambitious,,49,Capricorn
217,1cyndi,Caucasian,is looking for a relationship.,Associates Degree,Hopeless Romantic,Long Term,Socially,Prefer Not To Say,Divorced,Brown,Brown,Yes,Yes,Over 7 years,Ambitious,,56,Aquarius
191,210Frances,Hispanic,wants to date but nothing serious.,High school,Hopeless Romantic,Hang Out,Socially,Does not want children,Divorced,Brown,Hazel,Yes,All my kids are over 18,Over 10 years,Somewhat Ambitious,Dog,60,Aries
309,210shine,Hispanic,wants to date but nothing serious.,High school,Foodie,Hang Out,Socially,Does not want children,Single,Black,Brown,Yes,All my kids are over 18,Over 10 years,Not Ambitious,,39,Taurus
267,22aries74,Black,wants to date but nothing serious.,Bachelors Degree,Free Thinker,Hang Out,Socially,Does not want children,Single,Black,Brown,Yes,Yes,Over 10 years,Ambitious,,45,Aries
255,2705alicia,Hispanic,wants to find someone to marry.,Bachelors Degree,Adventurer,Long Term,Socially,Want children,Single,Black,Brown,Yes,No,Over 7 years,Somewhat Ambitious,No Pets,38,Gemini
490,2k18blessings,Black,is looking for a relationship.,Bachelors Degree,Poet,Long Term,Socially,Does not want children,Divorced,Black,Brown,Yes,All my kids are over 18,Over 10 years,Very Ambitious,,57,Capricorn


### start modeling

In [None]:
dfFinal['Ethnicity'].value_counts()

Caucasian          225
Hispanic           180
Black              112
Mixed Race          29
Other Ethnicity     14
Asian                7
Native American      4
Indian               1
Name: Ethnicity, dtype: int64

In [None]:
# define Y dataset
dfFinal['Ethnicity'] = dfFinal['Ethnicity'].astype('category')
dfFinal['Ethnicity_cat'] = dfFinal['Ethnicity'].cat.codes
arrY = dfFinal['Ethnicity_cat']

In [None]:
arrY.value_counts()

2    225
3    180
1    112
5     29
7     14
0      7
6      4
4      1
Name: Ethnicity_cat, dtype: int64

In [None]:
# create Train and Test datasets
arrTrainX, arrTestX, arrTrainY, arrTestY = train_test_split(arrX, arrY, random_state=17) # stratify=arrY)

In [None]:
# create model
model = lib.MakeCIFAR10StyleModel.Go(arrTrainX, arrTestX, arrTrainY, arrTestY)

Instructions for updating:
Colocations handled automatically by placer.
Not using data augmentation.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


In [None]:
# save model and weights
if not os.path.isdir(strPath):
    os.makedirs(strPath)
strModelPath = os.path.join(strPath, strModel)
model.save(strModelPath)
print('Saved trained model at %s ' % strModelPath)

In [None]:
# score trained model
arrScores = model.evaluate(arrTestX, arrTestY, verbose=1)
arrPredict = model.predict(arrTestX)
print('Test loss:', arrScores[0])
print('Test accuracy:', arrScores[1])

In [None]:
arrPredict