In this script, we explore how participants are distributed across their main characteristics.

The aim is to use stratified sampling instead of random sampling when splitting the original dataset into training and testing sets for classification.

#### 1. Imports and Set Up

In [114]:
import os
import pandas as pd
import numpy as np

#### Set up working directory

# Switch to the folder holding the project outputs.
# os.chdir() returns None, so the resulting directory is read back with
# os.getcwd(); assigning the result of os.chdir() would leave `cwd` as None.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
os.chdir('/Users/alessia/Documents/DataScience/NLP_Project/Outputs')
cwd = os.getcwd()

#### 2. Get Data

In [115]:
# Load the consultation responses from the current working directory.
# NOTE(review): an earlier note said the header is spread over two rows, but the
# file is parsed with read_csv's default single header row - confirm that the
# CSV was already flattened upstream.
cons1_csv_name = "cons1_df.csv"
cons1_df = pd.read_csv(cons1_csv_name)

In [116]:
# Explore data: print every column header next to its positional index.
# A plain loop is used rather than a list comprehension so that the cell does
# not also echo a throw-away list of None values (print's return value).
for num, question in enumerate(cons1_df.columns):
    print('{} = {}'.format(num, question))

0 = Unnamed: 0
1 = Respondent ID
2 = Collector ID
3 = Start Date
4 = End Date
5 = IP Address
6 = Email Address
7 = First Name
8 = Last Name
9 = Custom Data 1
10 = Are you responding on behalf of an organisation, or as an individual?Response
11 = If you are responding on behalf of an organisation, please provide details below. Organisation name
12 = If you are responding on behalf of an organisation, please provide details below. Contact name
13 = If you are responding on behalf of an organisation, please provide details below. Email address
14 = If you are responding on behalf of an organisation, please provide details below. Telephone number
15 = If you are responding as an individual, please provide details. Name
16 = If you are responding as an individual, please provide details. Email address
17 = If you are responding as an individual, please provide details. Telephone number
18 = Public sectorResponse
19 = Public sectorOther public sector (please specify)
20 = Private sectorRespo

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [117]:
# Key characteristics seem to be in columns:
# 10 organisation vs. individual
# 18 Public sector (also check 19)
# 20 Private sector (also check 21)
# 22 Other sectors

# Secondary characteristics we may want to look at:
# 24-39 Type of use of pop and housing statistics

#### 3. Explore distribution of main characteristics

In [118]:
# Column 10 = "Are you responding on behalf of an organisation, or as an individual?Response"
respondent_type = cons1_df.iloc[:, 10]

# Missing answers first, then the frequency of each answer
print("Number of NaN: {}".format(respondent_type.isnull().sum()))
print(respondent_type.value_counts())

Number of NaN: 23
Individual      811
Organisation    274
Name: Are you responding on behalf of an organisation, or as an individual?Response, dtype: int64


In [119]:
# Column 18 = public-sector response, column 19 = free-text "other public sector"
public_sector = cons1_df.iloc[:, 18]
public_sector_other = cons1_df.iloc[:, 19]

# 18: missing values and frequency of each listed public-sector option
print("Number of NaN in Public sector: {}".format(public_sector.isnull().sum()))
print(public_sector.value_counts())

# 19: number of free-text entries and their frequencies
print("Number of other Public sectors: {}".format(public_sector_other.count()))
print(public_sector_other.value_counts())


# Idea: combine these into one variable "Public Sector" (1 / 0) and another one "Public sector type"
# (Local or sub-national government, Central government department or agency, University, Health, School or college, Other)

Number of NaN in Public sector: 947
Local or sub-national government           119
Central government department or agency     21
University                                  14
Health                                       5
School or college                            2
Name: Public sectorResponse, dtype: int64
Number of other Public sectors: 27
Parish Council                                                                                                                                                                        2
town and parish councils                                                                                                                                                              2
Higher Education administration                                                                                                                                                       1
Non-Departmental Public Body                                                                        

In [120]:
# Column 20 = private-sector response, column 21 = free-text "other private sector"
private_sector = cons1_df.iloc[:, 20]
private_sector_other = cons1_df.iloc[:, 21]

# 20: missing values and frequency of each listed private-sector option
print("Number of NaN in Private Sector: {}".format(private_sector.isnull().sum()))
print(private_sector.value_counts())

# 21: number of free-text entries and their frequencies
print("Number of other Private sectors: {}".format(private_sector_other.count()))
print(private_sector_other.value_counts())


Number of NaN in Private Sector: 1098
Service industry     3
Manufacturing        3
University           2
Health               1
School or college    1
Name: Private sectorResponse, dtype: int64
Number of other Private sectors: 18
Housing Provision                                                                                                                                        1
Professional Body                                                                                                                                        1
Genealogy                                                                                                                                                1
Media Research                                                                                                                                           1
Housing                                                                                                                                             

In [121]:
# Column 22 = other sectors
other_sectors = cons1_df.iloc[:, 22]

print("Number of NaN: {}".format(other_sectors.isnull().sum()))
print(other_sectors.value_counts())

Number of NaN: 1051
Voluntary / Community / Non-profit    51
Media                                  6
Name: Other sectorsResponse, dtype: int64


In [122]:
# Dummy variable: 1 when the respondent reported a public-sector affiliation,
# i.e. either a listed option (column 18) or a free-text entry (column 19) is present.
has_public_choice = cons1_df.iloc[:, 18].notnull()
has_public_other = cons1_df.iloc[:, 19].notnull()
cond_PubSec = has_public_choice | has_public_other

# True/False -> 1/0
cons1_df['PublicSector'] = cond_PubSec.astype(int)

In [123]:
# Sanity check: source columns 18 and 19 next to the last column of the frame
# (the flag that was just appended).
public_check_positions = [18, 19, -1]
cons1_df.iloc[:, public_check_positions].head()

Unnamed: 0,Public sectorResponse,Public sectorOther public sector (please specify),PublicSector
0,,,0
1,,,0
2,,,0
3,,,0
4,Local or sub-national government,,1


In [124]:
# Dummy variable: 1 when the respondent reported a private-sector affiliation,
# i.e. either a listed option (column 20) or a free-text entry (column 21) is present.
has_private_choice = cons1_df.iloc[:, 20].notnull()
has_private_other = cons1_df.iloc[:, 21].notnull()
cond_PrvSec = has_private_choice | has_private_other

# True/False -> 1/0
cons1_df['PrivateSector'] = cond_PrvSec.astype(int)

In [125]:
# Sanity check: source columns 20 and 21 next to the last column of the frame
# (the flag that was just appended).
private_check_positions = [20, 21, -1]
cons1_df.iloc[:, private_check_positions].head()

Unnamed: 0,Private sectorResponse,Private sectorOther private sector (please specify),PrivateSector
0,,,0
1,,,0
2,,,0
3,,Market Research,1
4,,,0


In [126]:
# Dummy variable: 1 when the respondent reported belonging to one of the
# "other" sectors (neither Public nor Private), i.e. column 22 is filled in.
cond_OthSec = cons1_df.iloc[:, 22].notnull()

# True/False -> 1/0
cons1_df['OtherSectors'] = cond_OthSec.astype(int)

In [127]:
# Sanity check: source column 22 next to the last column of the frame
# (the flag that was just appended).
other_check_positions = [22, -1]
cons1_df.iloc[:, other_check_positions].head(10)

Unnamed: 0,Other sectorsResponse,OtherSectors
0,,0
1,,0
2,,0
3,,0
4,,0
5,Voluntary / Community / Non-profit,1
6,,0
7,,0
8,,0
9,,0


In [128]:
# Crosstabs across the 3 main Sector classes
# (rows: PublicSector x PrivateSector, columns: OtherSectors, with totals).
# The flags are selected by name rather than by trailing position (-3, -2, -1),
# so the table keeps referring to the sector flags even after further columns
# have been appended to cons1_df (e.g. when this cell is re-run later).
pd.crosstab(
    [cons1_df['PublicSector'], cons1_df['PrivateSector']],
    cons1_df['OtherSectors'],
    margins=True,
)

Unnamed: 0_level_0,OtherSectors,0,1,All
PublicSector,PrivateSector,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,849,53,902
0,1.0,22,1,23
1,0.0,177,1,178
1,1.0,3,2,5
All,,1051,57,1108


In [129]:
# 7 respondents selected more than one sector type, so their sector class is ambiguous...

#### 4. Get VADER polarity score for each cell text, for stratified sampling

In [130]:
# Move into the repository root that contains the `nlpfunctions` package so it can be imported.
# os.chdir() returns None, so the resulting directory is read back with
# os.getcwd(); assigning the result of os.chdir() would leave `cwd` as None.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
os.chdir('/Users/alessia/Documents/DataScience/textconsultations/')
cwd = os.getcwd()

In [131]:
# Show what the current working directory contains.
os.listdir('.')

['nlpfunctions', 'tutorial', 'README.md', '.git']

In [132]:
# Show the contents of the helper package folder.
package_dir = 'nlpfunctions'
os.listdir(package_dir)

['similarities.py',
 'textranksummary.py',
 '__init__.py',
 '__pycache__',
 'basic_NLP_functions.py']

In [133]:
import nlpfunctions.basic_NLP_functions as b_nlp

In [134]:
# List the names exposed by the imported helper module.
dir(b_nlp)

['POS_tagging_df',
 'SentimentIntensityAnalyzer',
 'TextBlob',
 'WordNetLemmatizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'analyser',
 'break_words_df',
 'classify_subjectivity_df',
 'fix_neg_aux_df',
 'get_sentiment_score_df',
 'get_subjectivity_df',
 'get_wordnet_pos',
 'lemmatise_df',
 'list2string_df',
 'np',
 'pd',
 'pos_tag',
 'remove_objective_sents_df',
 'remove_punctuation_df',
 'remove_stopwords_df',
 'sent_tokenise_df',
 'sent_tokenize',
 'stopwords',
 'string',
 'word_detokenise_sent_df',
 'word_tokenise_df',
 'word_tokenize',
 'wordnet',
 'wordnet_lemmatiser']

In [135]:
# Locate the relevant question: positional index of the first column whose
# header mentions 'census methods'.
q1_candidates = [col for col in cons1_df if 'census methods' in str(col)]
idx_Q1 = cons1_df.columns.get_loc(str(q1_candidates[0]))

In [136]:
# Display the full header text of the column found at idx_Q1.
cons1_df.columns[idx_Q1]

'1. What are your views of the different census methods described in the consultation document?Open-Ended Response'

In [137]:
# Give the open-ended question 1 column a short, code-friendly name (in place).
q1_long_name = '1. What are your views of the different census methods described in the consultation document?Open-Ended Response'
cons1_df.rename(columns={q1_long_name: 'Q1_census_methods'}, inplace=True)

Sentence-tokenise text

In [138]:
# Evaluate the renamed column to confirm it exists; the trailing semicolon suppresses the cell output.
cons1_df['Q1_census_methods']; #ok

In [139]:
# How many respondents left question 1 unanswered?  (372 NaN)
n_missing_q1 = cons1_df['Q1_census_methods'].isnull().sum()
print("Number of NaN in Q1: {}".format(n_missing_q1))

# (rows, columns) of the working frame
cons1_df.shape

Number of NaN in Q1: 372


(1108, 54)

In [140]:
# Check the type of the object returned when selecting the Q1 column.
type(cons1_df['Q1_census_methods'])

pandas.core.series.Series

In [141]:
# Preview the first five answers to question 1.
cons1_df['Q1_census_methods'].head(5)

0                                                  NaN
1                                                  NaN
2                                                  NaN
3    Moving to a primarily online census: an inevit...
4    A regular full population census is absolutely...
Name: Q1_census_methods, dtype: object

In [142]:
# Print the Python type of each of the first five Q1 cells.
# A plain loop is used rather than a list comprehension so that the cell does
# not also echo a throw-away list of None values (print's return value).
for cell in cons1_df['Q1_census_methods'].head():
    print(type(cell))

<class 'float'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'str'>


[None, None, None, None, None]

In [170]:
#cons1_df['Q1_census_methods'].apply(lambda x : b_nlp.sent_tokenise_df(x))

In [145]:
from nltk.tokenize import sent_tokenize
import pandas as pd

def sent_tokenise_answer(INPUT) :

    """
    Split one free-text answer into its sentences.

    Parameters
    ----------
    INPUT : the answer to tokenise, i.e. the value held by a single dataframe cell.
        Anything that is not a non-empty string is treated as "no answer provided".

    Returns
    -------
    pandas Series with a single entry, 'sent_tok_text', holding the list of
    sentence strings returned by sent_tokenize (an empty list when no answer
    was provided).
    """

    # only a non-empty string counts as an answer
    has_text = bool(INPUT) and isinstance(INPUT, str)

    if has_text:
        sentences = sent_tokenize(INPUT)
    else:
        sentences = []

    return pd.Series({'sent_tok_text': sentences})


In [148]:
# Sentence-tokenise every Q1 answer (one list of sentences per respondent).
q1_answers = cons1_df['Q1_census_methods']
cons1_df['sent_tok_text'] = q1_answers.apply(sent_tokenise_answer)

In [151]:
# Score every tokenised answer with the sentiment helper from nlpfunctions.
tokenised_answers = cons1_df['sent_tok_text']
cons1_df['SA_scores_sents'] = tokenised_answers.apply(b_nlp.get_sentiment_score_df)

In [155]:
# Collapse the per-sentence scores of each answer into a single mean score.
sentence_scores = cons1_df['SA_scores_sents']
cons1_df['mean_SA_scores'] = sentence_scores.apply(lambda scores: np.mean(scores))

In [165]:
def _polarity_label(score):
    """Label a mean score: 'pos' if above zero, 'neg' if below zero, "" otherwise."""
    if score > 0:
        return 'pos'
    if score < 0:
        return 'neg'
    # neither comparison holds (e.g. a score of exactly 0, or NaN)
    return ""


cons1_df['SA_polarity'] = cons1_df['mean_SA_scores'].apply(_polarity_label)

In [166]:
# Preview the last three columns of the frame for the first ten rows.
cons1_df.iloc[:, -3:].head(10)

Unnamed: 0,SA_scores_sents,mean_SA_scores,SA_polarity
0,,,
1,,,
2,,,
3,"[0.0, -0.4585]",-0.22925,neg
4,"[0.0, 0.3818, 0.0, 0.4404, 0.0, 0.0, 0.4404, 0...",0.257857,pos
5,"[0.743, 0.3818, 0.4588, 0.34, 0.7845]",0.54162,pos
6,,,
7,"[0.3612, -0.743, 0.0, 0.4939, 0.0, 0.6802, -0....",0.055771,pos
8,"[0.0, 0.3818, 0.3182, -0.6124]",0.0219,pos
9,"[0.0, -0.3182, 0.9513, 0.631, 0.5994, 0.5994]",0.410483,pos


In [168]:
# Count how many answers carry each polarity label.
cons1_df['SA_polarity'].value_counts()    # 535 pos, 133 neg, 440 with the empty label ""

pos    535
       440
neg    133
Name: SA_polarity, dtype: int64

In [173]:
# Plan: sample 100 positive and 100 negative texts, 30% from organisations and
# 70% from individuals. First check how many texts each combination holds.
respondent_col = 'Are you responding on behalf of an organisation, or as an individual?Response'
pd.crosstab(cons1_df['SA_polarity'], cons1_df[respondent_col])

"Are you responding on behalf of an organisation, or as an individual?Response",Individual,Organisation
SA_polarity,Unnamed: 1_level_1,Unnamed: 2_level_1
,378,46
neg,99,33
pos,334,195


In [183]:
# Create strata: one boolean mask per combination of polarity and respondent type.
polarity = cons1_df['SA_polarity']
respondent = cons1_df['Are you responding on behalf of an organisation, or as an individual?Response']

is_pos = polarity == 'pos'
is_neg = polarity == 'neg'
is_org = respondent == 'Organisation'
is_ind = respondent == 'Individual'

pos_org_cond = is_pos & is_org
neg_org_cond = is_neg & is_org

pos_ind_cond = is_pos & is_ind
neg_ind_cond = is_neg & is_ind


In [188]:
# Draw the stratified sample: 30 organisation and 70 individual texts per polarity.
# pandas' .sample() draws from NumPy's random state, not from Python's `random`
# module, so random.seed() alone does not make the draw repeatable. An explicit
# random_state is therefore passed to every .sample() call.
import random

SAMPLE_SEED = 11
random.seed(SAMPLE_SEED)  # kept so any later use of `random` stays seeded as before

pos_org_sample = cons1_df[pos_org_cond].sample(30, random_state=SAMPLE_SEED)
neg_org_sample = cons1_df[neg_org_cond].sample(30, random_state=SAMPLE_SEED)

pos_ind_sample = cons1_df[pos_ind_cond].sample(70, random_state=SAMPLE_SEED)
neg_ind_sample = cons1_df[neg_ind_cond].sample(70, random_state=SAMPLE_SEED)

In [189]:
# Confirm the number of rows drawn for each stratum.
for stratum_sample in (pos_org_sample, neg_org_sample, pos_ind_sample, neg_ind_sample):
    print(stratum_sample.shape[0])

30
30
70
70


In [190]:
# Stack the four stratum samples on top of each other into a single frame.
datas = [
    pos_org_sample,
    neg_org_sample,
    pos_ind_sample,
    neg_ind_sample,
]

sa_q1_sample = pd.concat(datas, axis=0)

In [199]:
# Keep only the columns needed for the export, selected by position:
# 1, 40, 10 and the last seven columns of the frame.
# NOTE(review): this overwrites sa_q1_sample with a narrower frame, so running
# the cell a second time would index positions that no longer exist - confirm
# whether selecting by column name would be preferable.
export_positions = [1, 40, 10, -7, -6, -5, -4, -3, -2, -1]
sa_q1_sample = sa_q1_sample.iloc[:, export_positions]

In [201]:
# Write the sample to disk.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
sample_csv_path = "/Users/alessia/Documents/DataScience/NLP_Project/Outputs/sa_q1_sample.csv"
sa_q1_sample.to_csv(sample_csv_path)