In [1]:
import coins
import time

# Loading and initial preprocessing of raw data; creation of customized DataFrames

In [2]:
# LOAD RAW DATA
# Fetch each preprocessed raw table by name, in the order listed, and bind the
# results to the matching variable names.
ipip, mpzm, mood, images, sessions, imageLabels = (
    coins.io.getPreprocessedRawData(tableName)
    for tableName in ('ipip', 'mpzm', 'mood', 'images', 'sessions', 'imageLabels')
)

In [3]:
# BUILD DEFINED DATAFRAMES
# Derive the analysis DataFrames from the raw tables via coins.dfcreation.
dfPersonality = coins.dfcreation.createPersonality(ipip)
# Both image frames are built from the same raw `images` table.
dfImageDescriptions = coins.dfcreation.createImageDescriptions(images)
dfImageRatings = coins.dfcreation.createImageRatings(images)
# NOTE(review): `ipip` is passed twice (1st and 3rd argument) while the loaded
# `sessions` table is not used in this cell -- possibly the 3rd argument was
# meant to be `sessions`; confirm against createSocioDemographics' signature.
dfSocioDemographics = coins.dfcreation.createSocioDemographics(ipip, mpzm, ipip, mood)
# The image labels are used as loaded, without any further transformation.
dfImageLabels = imageLabels

In [4]:
# SAVE DEFINED DATAFRAMES
# (frame, storage key) pairs, written in the order listed.
initialFrames = [
    (dfPersonality, 'personality'),
    (dfImageDescriptions, 'imageDescriptions'),
    (dfImageRatings, 'imageRatings'),
    (dfSocioDemographics, 'socioDemographics'),
    (dfImageLabels, 'imageLabels'),
]
for frame, frameKey in initialFrames:
    coins.io.saveInitialDFs(frame, frameKey)

# Special Preparation of dfImageDescriptions

In [6]:
# LOAD DATAFRAME
# Reload the frame stored under the 'imageDescriptions' key (the same key the
# save cell passes to saveInitialDFs), so this section can start from disk.
dfImageDescriptions = coins.io.loadInitialDFs('imageDescriptions')

In [None]:
# OPTION 1: Analyze and save imageDescriptions (needs credentials, costs money)
# Pipeline: translate -> sentiment/emotion analysis -> fill -> persist.
# translateToEnglish was called as a bare name, which is neither defined nor
# imported in this notebook and raises NameError on a fresh kernel. It is now
# qualified like the sibling NLU helpers.
# NOTE(review): assumes translateToEnglish lives in coins.nluTranslation -- confirm.
dfImageDescriptions = coins.nluTranslation.translateToEnglish(dfImageDescriptions)
dfImageDescriptions = coins.nluTranslation.analyzeEnglishSentimentAndEmotions(dfImageDescriptions)
dfImageDescriptions = coins.nluTranslation.fillImageDescriptions(dfImageDescriptions)
# Persist the result so the paid analysis does not have to be repeated.
coins.io.savePreparedImageDescriptions(dfImageDescriptions)

In [4]:
# OPTION 2: Load existing analyzed imageDescriptions
# Alternative to OPTION 1: replaces dfImageDescriptions with a previously stored
# result instead of re-running the analysis (presumably the file written by
# savePreparedImageDescriptions -- confirm).
dfImageDescriptions = coins.io.loadPreparedImageDescriptions()

# Check for significant Correlations between all 5 DataFrames

In [2]:
# LOAD DATAFRAMES
# Reload the four initial frames by their storage keys.
dfPersonality = coins.io.loadInitialDFs('personality')
dfImageRatings = coins.io.loadInitialDFs('imageRatings')
dfSocioDemographics = coins.io.loadInitialDFs('socioDemographics')
dfImageLabels = coins.io.loadInitialDFs('imageLabels')
# The analyzed image descriptions are written with savePreparedImageDescriptions
# and read elsewhere in this notebook with loadPreparedImageDescriptions; use the
# same loader here instead of the unmatched loadAnalyzedImageDescriptions name.
dfImageDescriptions = coins.io.loadPreparedImageDescriptions()

In [3]:
# PREPARE DATAFRAME VALUES
# Each call overwrites its own input, so this cell is not idempotent: re-running
# it without re-running the load cell feeds already-prepared frames back in.
dfPersonality = coins.correlation.preparePersonality(dfPersonality, multiclass=False, split='mean')
dfImageDescriptions = coins.correlation.prepareImageDescriptions(dfImageDescriptions, multiclass=False, split='median')
# NOTE(review): the meaning of the positional argument `1` is not visible here -- confirm.
# dropList is returned alongside the frame but is not used in the cells shown.
dfSocioDemographics, dropList = coins.correlation.prepareSocioDemographics(dfSocioDemographics, 1)

In [4]:
# CALCULATE CORRELATIONS AND P-VALUES
def _corrWithPValue(dfLeft, dfRight):
    """Run calculateCorrWithPValue on one pair of frames and keep its last two results.

    The helper returns five values; the first three are discarded. Unpacking
    them into `_` inside this function keeps `_` local, so the notebook-level
    `_` (IPython's last-output variable) is no longer overwritten.

    Returns:
        A (p, c) tuple: the 4th and 5th return values, which the save cell
        passes to saveCorrelations as p-values and correlations respectively.
    """
    _, _, _, pValues, correlations = coins.correlation.calculateCorrWithPValue(dfLeft, dfRight)
    return pValues, correlations


# One call per pair of frames; the result names are unchanged for downstream cells.
pPersonalitySocioDemographics, cPersonalitySocioDemographics = _corrWithPValue(dfPersonality, dfSocioDemographics)
pPersonalityImageDescriptions, cPersonalityImageDescriptions = _corrWithPValue(dfPersonality, dfImageDescriptions)
pPersonalityImageRatings, cPersonalityImageRatings = _corrWithPValue(dfPersonality, dfImageRatings)
pSocioDemographicsImageDescriptions, cSocioDemographicsImageDescriptions = _corrWithPValue(dfSocioDemographics, dfImageDescriptions)
pSocioDemographicsImageRatings, cSocioDemographicsImageRatings = _corrWithPValue(dfSocioDemographics, dfImageRatings)
pImageDescriptionsImageRatings, cImageDescriptionsImageRatings = _corrWithPValue(dfImageDescriptions, dfImageRatings)

In [5]:
# SAVE CORRELATIONS AND P-VALUES
# (correlations, p-values, storage name) triples, written in the order listed.
# NOTE(review): 'imageRatings_socioDemographics' lists the frames in the opposite
# order to its variables and to the other names; kept as-is because consumers of
# the saved files are not visible here -- confirm whether this is intended.
correlationResults = [
    (cPersonalitySocioDemographics, pPersonalitySocioDemographics, 'personality_socioDemographics'),
    (cPersonalityImageDescriptions, pPersonalityImageDescriptions, 'personality_imageDescriptions'),
    (cPersonalityImageRatings, pPersonalityImageRatings, 'personality_imageRatings'),
    (cSocioDemographicsImageDescriptions, pSocioDemographicsImageDescriptions, 'socioDemographics_imageDescriptions'),
    (cSocioDemographicsImageRatings, pSocioDemographicsImageRatings, 'imageRatings_socioDemographics'),
    (cImageDescriptionsImageRatings, pImageDescriptionsImageRatings, 'imageDescriptions_imageRatings'),
]
for correlations, pValues, pairName in correlationResults:
    coins.io.saveCorrelations(correlations, pValues, pairName)

# Find best Predictions

In [7]:
# LOAD DATAFRAMES
# Reload the four initial frames by storage key, in the order listed.
dfPersonality, dfImageRatings, dfSocioDemographics, dfImageLabels = (
    coins.io.loadInitialDFs(frameKey)
    for frameKey in ('personality', 'imageRatings', 'socioDemographics', 'imageLabels')
)
# The image descriptions come from the separately stored prepared version.
dfImageDescriptions = coins.io.loadPreparedImageDescriptions()

In [8]:
# PREPARE DATAFRAME VALUES
# Each call overwrites its own input, so this cell is not idempotent: re-running
# it without re-running the load cell feeds already-prepared frames back in.
dfPersonality = coins.correlation.preparePersonality(dfPersonality, multiclass=False, split='mean')
dfImageDescriptions = coins.correlation.prepareImageDescriptions(dfImageDescriptions, multiclass=False, split='median')
# NOTE(review): the meaning of the positional argument `1` is not visible here -- confirm.
# dropList is returned alongside the frame but is not used in the cells shown.
dfSocioDemographics, dropList = coins.correlation.prepareSocioDemographics(dfSocioDemographics, 1)

In [12]:
# Search for the best classifier per socio-demographic target and time the run.
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
# during a long run.
start_time = time.perf_counter()
result = coins.classification.findBestClassifier([dfPersonality, dfImageDescriptions], dfSocioDemographics)
end_time = time.perf_counter()
print("--- %s seconds ---" % (end_time - start_time))
# Bare last expression so the notebook renders the result table.
result

completed: registration_age
completed: job_status_edu_parttime
completed: job_status_edu_fulltime
completed: job_status_employed_parttime
completed: job_status_employed_fulltime
completed: job_status_selfemployed
completed: job_status_houskeeping
completed: job_status_unemployed
completed: gender_f
completed: gender_m
completed: registration_ageKat_10-19
completed: registration_ageKat_20-29
completed: registration_ageKat_30-39
completed: registration_ageKat_40-49
completed: registration_ageKat_50-59
completed: country_AT
completed: country_CH
completed: country_DE
completed: country_IT
completed: work_country_CH
completed: work_country_DE
completed: work_district_AG
completed: work_district_BE
completed: work_district_BL
completed: work_district_BS
completed: work_district_GR
completed: work_district_HE
completed: work_district_LU
completed: work_district_NI
completed: work_district_NW
completed: work_district_SG
completed: work_district_SO
completed: work_district_TG
completed: work_d

Unnamed: 0,TargetFeature,InputFeature,BestAlgorithm,R^2,Accuracy,Model,PCA,Standard Scaler
0,registration_age,reasons_translation_joyCategory| reasons_trans...,Gaussian Naive Bayes,-0.100981,0.178571,"GaussianNB(priors=None, var_smoothing=1e-09)","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
1,job_status_edu_parttime,neurotizismusCategory| extraversionCategory| g...,Logistic Regression,-0.555556,0.642857,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
2,job_status_edu_fulltime,no input feature with p-value below 0.05,-,-,-,-,-,-
3,job_status_employed_parttime,no input feature with p-value below 0.05,-,-,-,-,-,-
4,job_status_employed_fulltime,strengths_translation_sadnessCategory| strengt...,Logistic Regression,-0.166667,0.714286,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
...,...,...,...,...,...,...,...,...
65,"educational_achievement_Gymnasiale Maturität, ...",no input feature with p-value below 0.05,-,-,-,-,-,-
66,educational_achievement_Höhere Fach- und Beruf...,gewissenhaftigkeitCategory| story_translation_...,Logistic Regression,-0.333333,0.75,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
67,educational_achievement_Höhere Fachschule lusi...,utilization_translation_joyCategory,Logistic Regression,-0.0769231,0.928571,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
68,"educational_achievement_Master, Lizentiat, Dip...",emotions_translation_joyCategory| strengths_tr...,"KNN Classifier, Degree: 6",-0.166667,0.857143,"KNeighborsClassifier(algorithm='auto', leaf_si...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."


In [13]:
# Same search as the previous run, but with inputFeatureCombination=True; timed.
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
# during a long run.
start_time = time.perf_counter()
result = coins.classification.findBestClassifier([dfPersonality, dfImageDescriptions], dfSocioDemographics, inputFeatureCombination=True)
end_time = time.perf_counter()
print("--- %s seconds ---" % (end_time - start_time))
# Bare last expression so the notebook renders the result table.
result

completed: registration_age
completed: job_status_edu_parttime
completed: job_status_edu_fulltime
completed: job_status_employed_parttime
completed: job_status_employed_fulltime
completed: job_status_selfemployed
completed: job_status_houskeeping
completed: job_status_unemployed
completed: gender_f
completed: gender_m
completed: registration_ageKat_10-19
completed: registration_ageKat_20-29
completed: registration_ageKat_30-39
completed: registration_ageKat_40-49
completed: registration_ageKat_50-59
completed: country_AT
completed: country_CH
completed: country_DE
completed: country_IT
completed: work_country_CH
completed: work_country_DE
completed: work_district_AG
completed: work_district_BE
completed: work_district_BL
completed: work_district_BS
completed: work_district_GR
completed: work_district_HE
completed: work_district_LU
completed: work_district_NI
completed: work_district_NW
completed: work_district_SG
completed: work_district_SO
completed: work_district_TG
completed: work_d

Unnamed: 0,TargetFeature,InputFeature,BestAlgorithm,R^2,Accuracy,Model,PCA,Standard Scaler
0,registration_age,reasons_translation_joyCategory| reasons_trans...,"KNN Classifier, Degree: 7",-0.404898,0.25,"KNeighborsClassifier(algorithm='auto', leaf_si...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
1,job_status_edu_parttime,neurotizismusCategory,Logistic Regression,-0.555556,0.642857,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
2,job_status_edu_fulltime,no input feature with p-value below 0.05,-,-,-,-,-,-
3,job_status_employed_parttime,no input feature with p-value below 0.05,-,-,-,-,-,-
4,job_status_employed_fulltime,strengths_translation_joyCategory| story_trans...,Logistic Regression,-0.166667,0.714286,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
...,...,...,...,...,...,...,...,...
65,"educational_achievement_Gymnasiale Maturität, ...",no input feature with p-value below 0.05,-,-,-,-,-,-
66,educational_achievement_Höhere Fach- und Beruf...,gewissenhaftigkeitCategory,Logistic Regression,-0.333333,0.75,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
67,educational_achievement_Höhere Fachschule lusi...,utilization_translation_joyCategory,Logistic Regression,-0.0769231,0.928571,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
68,"educational_achievement_Master, Lizentiat, Dip...",emotions_translation_joyCategory,Logistic Regression,-0.166667,0.857143,"LogisticRegression(C=1.0, class_weight=None, d...","PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
