In [218]:
import http.client as http_client
http_client.HTTPConnection.debuglevel = 0
from watson_developer_cloud import PersonalityInsightsV3, ToneAnalyzerV3, WatsonApiException
import pandas as pd, numpy as np, os

# SECURITY NOTE(review): the IAM API keys below are committed in plain text.
# Anyone who can read this file can use them, so they should be rotated and
# supplied through the environment. The literals are kept only as fall-backs so
# that existing runs behave exactly as before when the variables are not set.
PersonalityAPI = os.environ.get('WATSON_PERSONALITY_API_KEY', 'xLGWjB2AUBrzzOoNzXbtB6eucNqqww_28anqfi70YFJ-')
ToneAPI = os.environ.get('WATSON_TONE_API_KEY', '-uXV4OpbD7DMEbkqk3UhepcDEDk0oqtYguer2cdHj9x7')

# Module-level Personality Insights client used by the Personality class.
personality_insights = PersonalityInsightsV3(
    version = '2018-07-13',
    iam_api_key = PersonalityAPI,
    url = 'https://gateway-syd.watsonplatform.net/personality-insights/api',
)
# Send the 'x-watson-learning-opt-out' header with every request.
personality_insights.set_default_headers({'x-watson-learning-opt-out': 'true'})

# Module-level Tone Analyzer client used by the Tone class.
tone_analyzer = ToneAnalyzerV3(
    version = '2016-05-19',
    iam_api_key = ToneAPI,
    url = 'https://gateway-syd.watsonplatform.net/tone-analyzer/api',
)
tone_analyzer.set_default_headers({'x-watson-learning-opt-out': 'true'})


class Personality():
    '''Get Personality of User.

    Sends ``text`` to the module-level ``personality_insights`` client and
    flattens the returned profile into one long DataFrame.

    Attributes:
        profile: the raw profile returned by the service, or ``None`` when the
            request failed (the failure is printed, not raised).
        X: DataFrame with the columns ``name``, ``percentile``, ``raw_score``
            and ``type``. It is empty when ``profile`` is ``None``, so callers
            should check ``profile is None`` (or ``X.empty``) before relying
            on the contents.
    '''

    # Column layout of ``X``.
    _COLUMNS = ['name', 'percentile', 'raw_score', 'type']
    # Bookkeeping fields of a trait record that are not carried into ``X``.
    _DROPPED_FIELDS = ['category', 'trait_id', 'significant']

    def __init__(self, text):
        self.profile = self._GetPersonality(text)
        self.X = self._processProfile()

    def _GetPersonality(self, text):
        '''Request a profile for ``text``; print the error and return None on failure.'''
        try:
            return personality_insights.profile(text, content_type = 'text/plain', content_language = 'en',
                                                raw_scores = True, consumption_preferences = True)
        except WatsonApiException as ex:
            print("Method failed with status code " + str(ex.code) + ": " + ex.message)
            return None

    @classmethod
    def _traitFrame(cls, records, kind):
        '''Turn a list of trait records into a frame labelled with ``type = kind``.'''
        # errors='ignore' keeps this working if a bookkeeping field is absent.
        frame = pd.DataFrame(records).drop(columns = cls._DROPPED_FIELDS, errors = 'ignore')
        frame['type'] = kind
        return frame

    def _processProfile(self):
        '''Flatten ``self.profile`` into a single DataFrame.

        Rows appear in this order: the facets of every big-5 trait, the big-5
        traits themselves (``type == 'big5'``), needs, values, and finally the
        consumption preferences (``type == 'consume_<category name>'``, which
        carry no percentile).

        Returns:
            The combined DataFrame with a fresh 0..n-1 index, or an empty
            DataFrame with the standard columns when the request failed.
        '''
        profile = self.profile
        if profile is None:
            # The API call failed and was already reported by _GetPersonality;
            # indexing None here would only raise an unhelpful TypeError.
            return pd.DataFrame(columns = self._COLUMNS)

        # ------------------ Big-5 traits and their facets
        facet_frames = []; big5_frames = []
        for trait in profile['personality']:
            # 'big5_openness' -> 'openness': facets are tagged with their parent trait.
            facet_frames.append(self._traitFrame(trait['children'], trait['trait_id'].split('_')[1]))
            big5_frames.append(pd.DataFrame.from_dict(
                {'name': trait['name'], 'percentile': trait['percentile'],
                 'raw_score': trait['raw_score'], 'type': 'big5'},
                orient = 'index').T)
        frames = facet_frames + big5_frames

        # ------------------ Needs and values
        frames.append(self._traitFrame(profile['needs'], 'needs'))
        frames.append(self._traitFrame(profile['values'], 'values'))

        # ------------------ Consumption preferences
        for category in profile['consumption_preferences']:
            habits = pd.DataFrame(category['consumption_preferences'])
            # Rename by key rather than by position so that a different key
            # order in the response cannot swap the name and score columns.
            habits = habits.drop(columns = ['consumption_preference_id'], errors = 'ignore')
            habits = habits.rename(columns = {'score': 'raw_score'})
            habits['type'] = 'consume_' + category['name']
            frames.append(habits)

        # ------------------ Merge into 1 large DataFrame
        user_profile = pd.concat(frames)
        user_profile.reset_index(drop = True, inplace = True)

        return user_profile
    
    
    
class Tone():
    '''Get Tone of text.

    Sends ``text`` to the module-level ``tone_analyzer`` client and keeps the
    tones of the first returned tone category.

    Attributes:
        tone: the raw response returned by the service, or ``None`` when the
            request failed (the failure is printed, not raised).
        X: DataFrame with the columns ``raw_score``, ``type`` (always
            ``'tone'``) and ``name``. It is empty when ``tone`` is ``None``.
    '''

    # Column layout of ``X``.
    _COLUMNS = ['raw_score', 'type', 'name']

    def __init__(self, text):
        self.tone = self._GetTone(text)
        self.X = self._processTone()

    def _GetTone(self, text):
        '''Request the tone of ``text``; print the error and return None on failure.'''
        try:
            return tone_analyzer.tone(tone_input = text, content_type = 'text/plain', sentences = False,
                                      content_language = 'en', tones = ['language'])
        except WatsonApiException as ex:
            print("Method failed with status code " + str(ex.code) + ": " + ex.message)
            return None

    def _processTone(self):
        '''Flatten the first tone category of ``self.tone`` into a DataFrame.

        Returns:
            One row per tone with ``raw_score`` (the tone's ``score``),
            ``type`` and ``name`` (the tone's ``tone_name``), or an empty
            DataFrame with those columns when the request failed or no tone
            category was returned.
        '''
        tone = self.tone
        if tone is None:
            # The API call failed and was already reported by _GetTone.
            return pd.DataFrame(columns = self._COLUMNS)

        categories = tone['document_tone']['tone_categories']
        records = categories[0]['tones'] if categories else []

        # Pick the two fields that are kept by key, so that extra or reordered
        # fields in the response cannot break or mislabel the columns.
        tones = pd.DataFrame(records, columns = ['score', 'tone_name'])
        tones = tones.rename(columns = {'score': 'raw_score', 'tone_name': 'name'})
        tones['type'] = 'tone'
        return tones[self._COLUMNS]
    
    

class BuildCorpus():
    '''Build one CSV of Watson features per text file.

    Expects ``directory`` to contain two folders: ``X`` with the ``.txt``
    inputs and ``X_clean`` with the ``.csv`` outputs written so far. A text
    file counts as done when ``X_clean`` holds a CSV with the same name
    (compared case-insensitively, without extension).
    '''
    def __init__(self, directory):
        '''List the input and output folders under ``directory``.

        If either folder cannot be listed, a message is printed and the
        instance is left with nothing to do, so ``_build`` becomes a no-op
        instead of failing on missing attributes.
        '''
        self.directory = directory
        try:
            self.files = os.listdir(directory+'/X')
            self.seen = os.listdir(directory+'/X_clean')
        except (OSError, TypeError):
            print('Directory = {} is wrong.'.format(directory))
            self.files = []
            self.seen = []

    @staticmethod
    def _readText(path):
        '''Read ``path`` and return its lines joined with ``'. '``.

        Each line is decoded as UTF-8, falling back to latin-1 (which accepts
        any byte sequence) for lines that are not valid UTF-8.
        '''
        with open(path, 'rb') as handle:
            raw_lines = handle.readlines()

        lines = []
        for raw in raw_lines:
            try: lines.append(raw.decode('utf-8'))
            except UnicodeDecodeError: lines.append(raw.decode('latin-1'))
        return '. '.join(lines)

    def _build(self):
        '''Process every text file that has no CSV yet and write its CSV.

        Files whose Watson request failed are skipped (and reported), so they
        are retried on the next run rather than aborting the whole batch.
        '''
        # Map lower-cased stem -> real file name, so the file is opened under
        # its actual name even on case-sensitive file systems. Entries that are
        # not .txt files are ignored.
        pending = {name[:-4].lower(): name for name in self.files if name.lower().endswith('.txt')}
        done = {name[:-4].lower() for name in self.seen if name.lower().endswith('.csv')}

        # sorted() gives a reproducible processing order.
        for stem in sorted(set(pending) - done):
            text = self._readText('{}/X/{}'.format(self.directory, pending[stem]))

            personality = Personality(text)
            tone = Tone(text)
            if personality.profile is None or tone.tone is None:
                print('Skipping {}: Watson request failed.'.format(pending[stem]))
                continue

            data = pd.concat([personality.X, tone.X])
            data.reset_index(drop = True, inplace = True)
            # Output keeps the lower-cased stem, as before.
            data.to_csv('{}/X_clean/{}.csv'.format(self.directory, stem), index = False)

In [None]:
# IPython shell escape: install/upgrade (-U) the savreaderwriter package in
# editable mode (-e) directly from its Bitbucket git repository.
!pip install -U -e git+https://bitbucket.org/fomcl/savreaderwriter.git#egg=savreaderwriter

In [246]:
# Load the survey export, then discard its first two rows.
data = pd.read_csv(
    'C:/Users/daniel.DESKTOP-JL0PCLC/Desktop/University-major_July-13-2018_19.56.csv'
)
data = data.iloc[2:]
# Show every column name so the question columns of interest can be picked out.
print(data.columns.tolist())

['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress', 'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId', 'RecipientLastName', 'RecipientFirstName', 'RecipientEmail', 'ExternalReference', 'LocationLatitude', 'LocationLongitude', 'DistributionChannel', 'UserLanguage', 'Q1', 'Q35', 'Q37', 'Q2', 'Q2_5_TEXT', 'Q3', 'Q3_4_TEXT', 'Q39_1', 'Q39_2', 'Q39_3', 'Q7', 'Q8', 'Q8_1_TEXT', 'Q9_1', 'Q9_2', 'Q9_3', 'Q9_4', 'Q9_5', 'Q9_6', 'Q9_7', 'Q9_8', 'Q10_1', 'Q10_2', 'Q10_3', 'Q10_4', 'Q10_5', 'Q10_6', 'Q10_7', 'Q10_8', 'Q11_1', 'Q11_2', 'Q11_3', 'Q11_4', 'Q11_5', 'Q11_6', 'Q11_7', 'Q11_8', 'Q12_1', 'Q12_2', 'Q12_3', 'Q12_4', 'Q12_5', 'Q12_6', 'Q12_7', 'Q12_8', 'Q13', 'Q41_1', 'Q41_2', 'Essay 1', 'Essay2', 'Q1_1', 'Q1_2', 'Q1_3', 'Q1_4', 'Q1_5', 'Q1_6', 'Q1_7', 'Q1_8', 'Q1_9', 'Q1_10', 'Q1_11', 'Q1_12', 'Q1_13', 'Q1_14', 'Q1_15', 'Q1_16', 'Q1_17', 'Q1_18', 'Q1_19', 'Q1_20', 'Q1_21', 'Q1_22', 'Q1_23', 'Q1_24', 'Q1_25', 'Q1_26', 'Q1_27', 'Q1_28', 'Q1_29', 'Q1_30', 'Q1_31', 'Q1_3

In [348]:
# The three columns of `data` to inspect.
columns_needed = [
    'Q3_4_TEXT',
    'Essay 1',
    'Essay2',
]
# Last expression of the cell, so the notebook displays the selection.
data.loc[:, columns_needed]

Unnamed: 0,Q3_4_TEXT,Essay 1,Essay2
2,I was inspired by what I can do with these ski...,I did not choose my university before my major...,My most important experience would be my schoo...
3,None of the above. I was interested in this cu...,Prompt 1\nI chose UNSW for a number of reasons...,Prompt 2\nI haven't gone to many sleepovers an...
4,,#1\nI chose UNSW because I like the campus and...,#2\nMy most important experience was the time ...
5,,4)\n\nI have worked in labs as research assist...,"3)\nIn my perception, I generally work well al..."
6,,,
7,,#1. \nMy undergraduate major is business admin...,#4\nI used to learn Japanese in Tokyo for 1 ye...
8,,,
9,,#1\n1.1 Why did I choose UNSW\nThe University ...,#4\n1.1\tWorking with friends in university\nI...


In [219]:
# Process every not-yet-converted text file under the 'maker' folder.
BuildCorpus(
    directory = 'C:/Users/daniel.DESKTOP-JL0PCLC/Desktop/maker',
)._build()

'0\n. "UNSW researchers win ticket to Nobel summitTwitter Facebook LinkedIn18 JUN 2015   AMY COOPESA UNSW cardiologist and a lung cancer researcher have been selected from a competitive field of Australian early career scientists to attend the 65th Lindau Nobel Laureates meeting in Germany.A UNSW cardiologist and a lung cancer researcher have been selected from a competitive field of Australian early career scientists to attend the 65th Lindau Nobel Laureates meeting in Germany.Amelia Parker and Dr Louis Wang will get the chance to exchange ideas with some of the world’s brightest minds in medicine, physics and chemistry at the summit, which will feature 66 Nobel winners and more than 650 delegates from almost 90 countries.Just 13 Australians were selected from a field of more than 70,000 young researchers worldwide, and will attend the Lindau summit on a Science and Industry Endowment Fund – Australian Academy of Science Fellowship.Parker studies non-small cell lung tumours and the ro

In [33]:
# Sample input: an excerpt of the pandas DataFrame.pivot_table documentation,
# kept verbatim as a block of text to feed to the analyzers.
text = '''pandas.DataFrame.pivot_table
DataFrame.pivot_table(values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')[source]
Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame

Parameters:	
values : column to aggregate, optional
index : column, Grouper, array, or list of the previous

If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values.

columns : column, Grouper, array, or list of the previous

If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values.

aggfunc : function, list of functions, dict, default numpy.mean

If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) If dict is passed, the key is column to aggregate and value is function or list of functions

fill_value : scalar, default None

Value to replace missing values with

margins : boolean, default False

Add all row / columns (e.g. for subtotal / grand totals)

dropna : boolean, default True

Do not include columns whose entries are all NaN

margins_name : string, default ‘All’

Name of the row / column that will contain the totals when margins is True.

Returns:	
table : DataFrame
See also
DataFrame.pivot
pivot without aggregation that can handle non-numeric data'''

Unnamed: 0,name,percentile,raw_score,type
0,Adventurousness,0.102446,0.459729,openness
1,Artistic interests,0.711242,0.693523,openness
2,Emotionality,0.0151797,0.559916,openness
3,Imagination,0.43917,0.727255,openness
4,Intellect,0.997123,0.735069,openness
5,Authority-challenging,0.97613,0.615939,openness
6,Achievement striving,0.42362,0.678199,conscientiousness
7,Cautiousness,0.893095,0.570949,conscientiousness
8,Dutifulness,0.183811,0.636837,conscientiousness
9,Orderliness,0.404888,0.486271,conscientiousness


In [170]:
# Analyse the sample text; the resulting table is available as tone.X.
tone = Tone(text = text)