In [76]:
import http.client as http_client
http_client.HTTPConnection.debuglevel = 0
from watson_developer_cloud import PersonalityInsightsV3, ToneAnalyzerV3, WatsonApiException
import pandas as pd, numpy as np, os

PersonalityAPI = 'xLGWjB2AUBrzzOoNzXbtB6eucNqqww_28anqfi70YFJ-'
ToneAPI = '-uXV4OpbD7DMEbkqk3UhepcDEDk0oqtYguer2cdHj9x7'

personality_insights = PersonalityInsightsV3(version = '2018-07-13', iam_api_key = PersonalityAPI, url = 'https://gateway-syd.watsonplatform.net/personality-insights/api')
personality_insights.set_default_headers({'x-watson-learning-opt-out': 'true'})

tone_analyzer = ToneAnalyzerV3(version = '2016-05-19', iam_api_key = ToneAPI, url = 'https://gateway-syd.watsonplatform.net/tone-analyzer/api')
tone_analyzer.set_default_headers({'x-watson-learning-opt-out': 'true'})


class Personality():
    '''Get Personality of User'''
    def __init__(self, text):
        self.profile = self._GetPersonality(text)
        self.X = self._processProfile()
        
    def _GetPersonality(self, text):
        try: return personality_insights.profile(text, content_type = 'text/plain', content_language = 'en', raw_scores = True, consumption_preferences = True)
        except WatsonApiException as ex: print("Method failed with status code " + str(ex.code) + ": " + ex.message); return None;
            
    def _processProfile(self):
        '''------------------First, extract the big 5 personalities and traits.'''
        personalities = []; big_5 = []
        profile = self.profile
        
        for x in profile['personality']:
            traits = pd.DataFrame(x['children'])
            del traits['category']; del traits['trait_id']; del traits['significant'];
            traits['type'] = x['trait_id'].split('_')[1]
            personalities.append(traits)
            big_5_one = pd.DataFrame.from_dict({'name':x['name'], 'percentile':x['percentile'], 'raw_score':x['raw_score'], 'type':'big5'},
                                                orient = 'index').T
            big_5.append(big_5_one)

        personality_profile = pd.concat(personalities)
        personality_profile = pd.concat([personality_profile, pd.concat(big_5)])

        '''------------------Next, Get Needs'''
        needs = pd.DataFrame(profile['needs'])
        needs['type'] = 'needs'; del needs['category']; del needs['significant']; del needs['trait_id'];

        '''------------------Next, Get Values'''
        values = pd.DataFrame(profile['values'])
        values['type'] = 'values'; del values['category']; del values['significant']; del values['trait_id'];

        '''------------------Next, Get consumption_preferences'''
        consume = []
        for x in profile['consumption_preferences']:
            habits = pd.DataFrame(x['consumption_preferences'])
            habits['type'] = 'consume_'+x['name']
            del habits['consumption_preference_id'];
            habits.columns = ['name','raw_score','type']
            consume.append(habits)

        consume_profile = pd.concat(consume)

        '''------------------Merge into 1 large DataFrame'''
        user_profile = pd.concat([personality_profile, needs, values, consume_profile])
        user_profile.reset_index(drop = True, inplace = True)
        
        return user_profile
    
    
    
class Tone():
    '''Get Tone of text'''
    def __init__(self, text):
        self.tone = self._GetTone(text)
        self.X = self._processTone()
        
    def _GetTone(self, text):
        try: return tone_analyzer.tone(tone_input = text, content_type = 'text/plain', sentences = False, content_language = 'en', tones = ['language'])
        except WatsonApiException as ex: print("Method failed with status code " + str(ex.code) + ": " + ex.message); return None;

    def _processTone(self):
        tone = self.tone
        tones = tone['document_tone']['tone_categories'][0]['tones']
        tones = pd.DataFrame(tones)
        tones['type'] = 'tone'
        tones['name'] = tones['tone_name']
        del tones['tone_id']; del tones['tone_name'];
        tones.columns = ['raw_score','type','name']
        return tones
    
    

class BuildCorpus():
    def __init__(self, directory):
        self.directory = directory
        try: 
            self.files = os.listdir(directory+'/X')
            self.seen = os.listdir(directory+'/X_clean')
        except: print('Directory = {} is wrong.'.format(directory)); return None;
    
    def _build(self):
        not_done = [x.split('.txt')[0].lower() for x in self.files]
        done = [x.split('.csv')[0].lower() for x in self.seen]
        need_to_do = [x+'.txt' for x in list(set(not_done) - set(done) - set('index'))]
        print(need_to_do)
        
        for x in need_to_do:
            try: file = open(self.directory+'/X/{}'.format(x), 'rb').readlines()
            except: file = open(self.directory+'/X/{}'.format(x.split('.txt')[0]), 'rb').readlines()
            files = []
            for i in file:
                try: files.append(i.decode('utf-8'))
                except: 
                    try: files.append(i.decode('latin-1'))
                    except: files.append(i)
            file = '. '.join(files)
            
            personality_profile = Personality(file).X
            tone_profile = Tone(file).X
            data = pd.concat([personality_profile, tone_profile])
            data.reset_index(drop = True, inplace = True)
            data.to_csv('{}/X_clean/{}.csv'.format(self.directory, x.split('.txt')[0]), index = False)
            
    def read(self):
        seen = os.listdir(self.directory+'/X_clean')
        datas = []
        for x in seen:
#             print(self.directory+'/X_clean/'+x)
            try: data = pd.read_csv(self.directory+'/X_clean/'+x)
            except: data = pd.read_csv(self.directory+'/X_clean/'+x, encoding = 'latin-1')
            datas.append(data)
        return datas, seen
    
def Profile(text):
    personality_profile = Personality(text).X
    tone_profile = Tone(text).X
    data = pd.concat([personality_profile, tone_profile])
    data.reset_index(drop = True, inplace = True)
    return data

In [10]:
BuildCorpus('C:/Users/daniel.DESKTOP-JL0PCLC/Desktop/maker')._build()

TypeError: unsupported operand type(s) for -: 'set' and 'str'

In [96]:
data_corpus, names = BuildCorpus('C:/Users/daniel.DESKTOP-JL0PCLC/Desktop/maker').read()

In [52]:
scores = []
for df in data_corpus:
    scores.append(df['raw_score'])
    
result = pd.concat(scores, 1)
result.columns = names
result.index = data_corpus[0]['type'] + '('+data_corpus[0]['name']+')'
result.loc[[x for x in result.index if 'consume' in x]] *= 0.2

In [54]:
data = pd.read_csv('C:/Users/daniel.DESKTOP-JL0PCLC/Desktop/University-major_July-23-2018_10.50.csv').iloc[2:]
data

Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,Q4,Q4_8_TEXT,Q5,Q6,Q7.1,Q8.1,Q9,Q10,Q11,Q4_8_TEXT - Topics
2,29-06-18 21:15,01-07-18 22:05,IP Address,101.191.72.174,100,175828,True,01-07-18 22:05,R_1Ic0LOMqFtleRMu,,...,Asian,,College (undergraduate),"$50,000 ~ $ 99,999",Transportation and Material Moving,24,4,"$100,000 ~ $ 149,999","$5,000 ~ $ 9,999",
3,01-07-18 22:05,01-07-18 22:49,IP Address,101.191.72.174,100,2653,True,01-07-18 22:49,R_1P0jmAQEk3sDjYC,,...,Asian,,College (undergraduate),"$50,000 ~ $ 99,999",Transportation and Material Moving,Sales and Related,4,"$100,000 ~ $ 149,999","$5,000 ~ $ 9,999",
4,03-07-18 21:30,03-07-18 22:59,IP Address,101.191.72.174,100,5344,True,03-07-18 22:59,R_24pUPVsqhb6wqWZ,,...,Asian,,College (undergraduate),"$50,000 ~ $ 99,999",Transportation and Material Moving,Sales and Related,4,I don’t know,"$5,000 ~ $ 9,999",
5,11-07-18 10:10,11-07-18 13:48,IP Address,149.171.209.110,100,13084,True,11-07-18 13:48,R_2bZ9BLKSUXTRvb8,,...,Asian,,I don’t want to disclose,I don’t know,I don’t wish to disclose,I don’t wish to disclose,2,"$ 200,000 above","$10,000 ~ $ 29,999",
6,04-07-18 17:36,04-07-18 17:42,IP Address,49.195.146.180,3,318,False,11-07-18 17:42,R_2PhuUdVkN9G3yqX,,...,,,,,,,,,,
7,12-07-18 8:45,12-07-18 10:04,IP Address,103.28.128.201,100,4735,True,12-07-18 10:04,R_0p0QarKt9ipfd6x,,...,Asian,,High school (junior and/or senior high school),I don’t know,Business and Financial Operations,Unemployed or retired,1,"$50,000 ~ $ 99,999","$70,000 above",
8,05-07-18 18:49,05-07-18 18:49,IP Address,122.106.195.92,3,19,False,12-07-18 18:49,R_1nNQanHsifDIzoT,,...,,,,,,,,,,
9,12-07-18 11:23,13-07-18 11:48,IP Address,58.109.77.132,100,87874,True,13-07-18 11:48,R_2dRwCc8oxz9ek0k,,...,I don’t want to disclose,,College (undergraduate),I don’t want to disclose,Architecture and Engineering,Community and Social Service,1,I don’t want to disclose,I don’t want to disclose,
10,11-07-18 20:17,17-07-18 20:13,IP Address,114.74.62.136,100,518209,True,17-07-18 20:13,R_2xRfOtLWKxRZrbj,,...,Asian,,College (undergraduate),I don’t want to disclose,I don’t wish to disclose,I don’t wish to disclose,1,"$ 200,000 above","Below $ 5,000",
11,10-07-18 22:47,10-07-18 22:47,IP Address,114.74.62.136,3,5,False,17-07-18 22:47,R_bjwIaqzHVkgK8mt,,...,,,,,,,,,,


In [78]:
columns_needed = ['Q3_4_TEXT', 'Essay 1', 'Essay2']
text = data[columns_needed[0]].fillna('').str.replace('\n',' ')
for x in columns_needed[1:]:
    text += ' '
    text += data[x].fillna('').str.replace('\n',' ')
text = text.values

In [87]:
people = []
for x in text:
    try:
        profile = Profile(x)
        people.append(profile)
    except:
        people.append(np.nan)

Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analysis: 100
Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analysis: 100
Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analysis: 100
Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analysis: 100
Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analysis: 100
Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analysis: 100
Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analysis: 100
Method failed with status code 400: The number of words 0 is less than the minimum number of words required for analys

In [104]:
people_scores = []
for df in people:
    try:
        people_scores.append(df['raw_score'])
    except:
        people_scores.append(data_corpus[0]['raw_score']*np.nan)
people_scores = pd.concat(people_scores, 1)
people_scores.index = data_corpus[0]['type'] + '('+data_corpus[0]['name']+')'

In [105]:
people_scores.to_csv('25-07-2018_profiles.csv')

In [107]:
people_scores.loc[[x for x in people_scores.index if 'consume' in x]]

Unnamed: 0,raw_score,raw_score.1,raw_score.2,raw_score.3,raw_score.4,raw_score.5,raw_score.6,raw_score.7,raw_score.8,raw_score.9,...,raw_score.10,raw_score.11,raw_score.12,raw_score.13,raw_score.14,raw_score.15,raw_score.16,raw_score.17,raw_score.18,raw_score.19
consume_Purchasing Preferences(Likely to be sensitive to ownership cost when buying automobiles),1.0,1.0,1.0,0.5,,1.0,,1.0,1.0,,...,1.0,1.0,1.0,0.0,,1.0,,1.0,1.0,1.0
consume_Purchasing Preferences(Likely to prefer safety when buying automobiles),0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0
consume_Purchasing Preferences(Likely to prefer quality when buying clothes),1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,,...,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0
consume_Purchasing Preferences(Likely to prefer style when buying clothes),1.0,0.0,1.0,1.0,,1.0,,1.0,1.0,,...,1.0,1.0,1.0,1.0,,1.0,,1.0,0.0,1.0
consume_Purchasing Preferences(Likely to prefer comfort when buying clothes),1.0,1.0,1.0,0.0,,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,,0.0,,0.0,1.0,0.0
consume_Purchasing Preferences(Likely to be influenced by brand name when making product purchases),0.0,0.0,0.0,0.0,,1.0,,0.0,1.0,,...,1.0,0.0,1.0,1.0,,1.0,,1.0,0.0,1.0
consume_Purchasing Preferences(Likely to be influenced by product utility when making product purchases),0.0,1.0,0.0,1.0,,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.5
consume_Purchasing Preferences(Likely to be influenced by online ads when making product purchases),1.0,1.0,1.0,1.0,,0.0,,1.0,0.0,,...,1.0,1.0,1.0,1.0,,0.0,,1.0,1.0,0.0
consume_Purchasing Preferences(Likely to be influenced by social media when making product purchases),0.0,0.0,0.0,0.0,,0.0,,1.0,0.0,,...,0.0,0.0,1.0,0.0,,0.0,,0.0,0.0,0.0
consume_Purchasing Preferences(Likely to be influenced by family when making product purchases),0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0


In [6]:
BuildCorpus(directory = 'C:/Users/daniel.DESKTOP-JL0PCLC/Desktop/maker')._build()

['socialscience_treatise on peace and the politics of identity.txt', 'business_why a company tax cut may not work.txt', 'liberalarts_does being bilingual make you sexy.txt', 'socialscience_how ethical is sexual assault research.txt', 'liberalarts_warwick egg incident of 1917.txt', 'engineering_student computer programmers shine on world stage.txt', 'engineering_producing oil with negative co2 emissions.txt', 'socialscience_hands off campaign targeting sexual harassment.txt', 'business_unsw business school academics research indian consumers.txt', 'engineering_optical expert recognised with international fellowship.txt', 'engineering_up up up and away.txt', 'socialscience_devastating impact of growing up in care.txt', 'engineering_ting ting.txt', 'engineering_bluesatblogaerospacesfutures.txt', 'business_introducing your student representative.txt', 'business_unsw business school examines why there are few women in leading financial roles.txt', 'engineering_tech leaders sign global pledg