In this script, we will explore the distribution of participants according to main characteristics.

The aim is to use stratified sampling instead of random sampling when splitting the original dataset into training and testing set for classification.

### 1. Imports and Set Up

In [115]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#### Set up working directory

cwd = os.chdir('/Users/alessia/Documents/DataScience/NLP_Project/Outputs')

In [121]:
pd.set_option('display.max_colwidth', -1)

### 2. Get Data

In [2]:
# Read in data (note header is spread over two rows)

cons1_df = pd.read_csv("cons1_df.csv")

In [3]:
# Explore data

#[print(str(num) + ' = ' + question) for num, question in enumerate(cons1_df.columns)]

Key characteristics seem to be in columns:
- 10 organisation vs. individual
- 18 Public sector (also check 19)
- 20 Private sector (also check 21)
- 22 Other sectors

Secondary characteristics we may want to look at:
- 24-39 Type of use of pop and housing statstics

### 3. Explore distribution of main characteristics

In [5]:
# 10 = Are you responding on behalf of an organisation, or as an individual?Response

print("Number of NaN: {}".format(cons1_df.iloc[:, 10].isnull().sum()))
print(cons1_df.iloc[:, 10].value_counts())

Number of NaN: 23
Individual      811
Organisation    274
Name: Are you responding on behalf of an organisation, or as an individual?Response, dtype: int64


In [6]:
# 18 Public Sector
#print("Number of NaN in Public sector: {}".format(cons1_df.iloc[:, 18].isnull().sum()))
#print(cons1_df.iloc[:, 18].value_counts())

# 19 Public Sector
#print("Number of other Public sectors: {}".format(cons1_df.iloc[:, 19].count()))
#print(cons1_df.iloc[:, 19].value_counts())


# Could combine this into one variable call "Public Sector" (1 / 0) and another one "Public sector type" 
#(Local or sub-national government, Central government department or agency, University, Health, School or college, Other)

In [7]:
# 20 Private Sector
#print("Number of NaN in Private Sector: {}".format(cons1_df.iloc[:, 20].isnull().sum()))
#print(cons1_df.iloc[:, 20].value_counts())

# 21 Private Sector
#print("Number of other Private sectors: {}".format(cons1_df.iloc[:, 21].count()))
#print(cons1_df.iloc[:, 21].value_counts())


In [8]:
# 18 Other Sectors
print("Number of NaN: {}".format(cons1_df.iloc[:, 22].isnull().sum()))
print(cons1_df.iloc[:, 22].value_counts())

Number of NaN: 1051
Voluntary / Community / Non-profit    51
Media                                  6
Name: Other sectorsResponse, dtype: int64


In [9]:
# Create a new dummy variable that tracks whether the record reported to be from Public Sector

cond_PubSec = cons1_df.iloc[:,18].notnull() | cons1_df.iloc[:,19].notnull()

cons1_df['PublicSector'] = [int(val) for val in cond_PubSec]

In [10]:
# check
cons1_df.iloc[:, [18, 19, -1]].head()

Unnamed: 0,Public sectorResponse,Public sectorOther public sector (please specify),PublicSector
0,,,0
1,,,0
2,,,0
3,,,0
4,Local or sub-national government,,1


In [11]:
# Create a new dummy variable that tracks whether the record reported to be from Private Sector

cond_PrvSec = cons1_df.iloc[:,20].notnull() | cons1_df.iloc[:,21].notnull()

cons1_df['PrivateSector'] = [int(val) for val in cond_PrvSec]

In [12]:
# check
cons1_df.iloc[:, [20, 21, -1]].head()

Unnamed: 0,Private sectorResponse,Private sectorOther private sector (please specify),PrivateSector
0,,,0
1,,,0
2,,,0
3,,Market Research,1
4,,,0


In [13]:
# Create a new dummy variable that tracks whether the record reported to be from Other Sectors (neither Public nor Public)

cond_OthSec = cons1_df.iloc[:,22].notnull()

cons1_df['OtherSectors'] = [int(val) for val in cond_OthSec]

In [14]:
# check
cons1_df.iloc[:, [22, -1]].head(10)

Unnamed: 0,Other sectorsResponse,OtherSectors
0,,0
1,,0
2,,0
3,,0
4,,0
5,Voluntary / Community / Non-profit,1
6,,0
7,,0
8,,0
9,,0


In [15]:
# Crossabs across the 3 main Sector classes
pd.crosstab([cons1_df.iloc[:,-3], cons1_df.iloc[:,-2]], cons1_df.iloc[:,-1], margins=True)   #

Unnamed: 0_level_0,OtherSectors,0,1,All
PublicSector,PrivateSector,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,849,53,902
0,1.0,22,1,23
1,0.0,177,1,178
1,1.0,3,2,5
All,,1051,57,1108


In [129]:
# 7 respondents selected from more than one Sector type, so their sector class is ambiguous...

### 4. Get VADER polarity score for each cell text, to use for stratified sampling

In [16]:
cwd = os.chdir('/Users/alessia/Documents/DataScience/textconsultations/')

In [17]:
os.listdir()

['nlpfunctions', 'tutorial', 'README.md', '.git']

In [37]:
os.listdir('nlpfunctions');

In [19]:
import nlpfunctions.basic_NLP_functions as b_nlp



Take a look at the user-defined basic NLP functions

In [20]:
dir(b_nlp)

['POS_tagging_df',
 'SentimentIntensityAnalyzer',
 'TextBlob',
 'WordNetLemmatizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'analyser',
 'break_words_df',
 'classify_subjectivity_df',
 'fix_neg_aux_df',
 'get_sentiment_score_df',
 'get_subjectivity_df',
 'get_textblob_sentiment_score_df',
 'get_wordnet_pos',
 'lemmatise_df',
 'list2string_df',
 'np',
 'pd',
 'pos_tag',
 'remove_objective_sents_df',
 'remove_punctuation_df',
 'remove_stopwords_df',
 'rescale_to_01_df',
 'sent_tokenise_df',
 'sent_tokenize',
 'stopwords',
 'string',
 'word_detokenise_sent_df',
 'word_tokenise_df',
 'word_tokenize',
 'wordnet',
 'wordnet_lemmatiser']

Let's rename the column that contains the relevant question

In [21]:
idx_Q1 = cons1_df.columns.get_loc(str([col for col in cons1_df if 'census methods' in str(col)][0]))

In [22]:
cons1_df.columns[idx_Q1]

'1. What are your views of the different census methods described in the consultation document?Open-Ended Response'

In [23]:
cons1_df.rename(columns = {'1. What are your views of the different census methods described in the consultation document?Open-Ended Response':'Q1_census_methods'}, inplace = True) 

Sentence-tokenise text

In [24]:
cons1_df['Q1_census_methods']; #ok

In [25]:
print("Number of NaN in Q1: {}".format(cons1_df['Q1_census_methods'].isnull().sum()))  #372 NaN
cons1_df.shape

Number of NaN in Q1: 372


(1108, 54)

In [26]:
type(cons1_df['Q1_census_methods'])

pandas.core.series.Series

In [141]:
cons1_df['Q1_census_methods'].head()

0                                                  NaN
1                                                  NaN
2                                                  NaN
3    Moving to a primarily online census: an inevit...
4    A regular full population census is absolutely...
Name: Q1_census_methods, dtype: object

In [27]:
[print(type(cell)) for cell in cons1_df['Q1_census_methods'].head()]

<class 'float'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'str'>


[None, None, None, None, None]

In [30]:
cons1_df['sent_tok_text'] = cons1_df['Q1_census_methods'].apply(lambda x: b_nlp.sent_tokenise_df(x))

In [33]:
#check
cons1_df['sent_tok_text'];

#### Calculate VADER polarity score for each sentence in each cell/answer

In [35]:
cons1_df['SA_scores_sents'] = cons1_df['sent_tok_text'].apply(lambda x: b_nlp.get_sentiment_score_df(x))

#### Take the mean polarity score for each cell (aggregating the scores of all the sentences within that cell)

In [36]:
# take the average for each cell/text
cons1_df['mean_SA_scores'] = cons1_df['SA_scores_sents'].apply(lambda x: np.mean(x))

#### Classify mean score as positive (if above 0) or negative (if below 0)

In [38]:
cons1_df['SA_polarity'] = cons1_df['mean_SA_scores'].apply(lambda x: 'pos' if x > 0 else 'neg' if x < 0 else "")

In [39]:
cons1_df.iloc[:, -3:].head(10)

Unnamed: 0,SA_scores_sents,mean_SA_scores,SA_polarity
0,,,
1,,,
2,,,
3,"[0.0, -0.4585]",-0.22925,neg
4,"[0.0, 0.3818, 0.0, 0.4404, 0.0, 0.0, 0.4404, 0...",0.257857,pos
5,"[0.743, 0.3818, 0.4588, 0.34, 0.7845]",0.54162,pos
6,,,
7,"[0.3612, -0.743, 0.0, 0.4939, 0.0, 0.6802, -0....",0.055771,pos
8,"[0.0, 0.3818, 0.3182, -0.6124]",0.0219,pos
9,"[0.0, -0.3182, 0.9513, 0.631, 0.5994, 0.5994]",0.410483,pos


In [40]:
cons1_df['SA_polarity'].value_counts()    #535 pos, 133 neg, ""440

pos    535
       440
neg    133
Name: SA_polarity, dtype: int64

#### Some crosstabulations between answer's polarity and respondent's main characteristics

In [41]:
# sample 100 positive and 100 negative sentence: 30% from organisations, 70% from individuals
pd.crosstab(cons1_df['SA_polarity'], cons1_df['Are you responding on behalf of an organisation, or as an individual?Response'])

"Are you responding on behalf of an organisation, or as an individual?Response",Individual,Organisation
SA_polarity,Unnamed: 1_level_1,Unnamed: 2_level_1
,378,46
neg,99,33
pos,334,195


### 5. Create Stratified random sample for test data

In [42]:
# create strata
pos_org_cond = ((cons1_df['SA_polarity'] == 'pos') & (cons1_df['Are you responding on behalf of an organisation, or as an individual?Response'] == 'Organisation'))
neg_org_cond = ((cons1_df['SA_polarity'] == 'neg') & (cons1_df['Are you responding on behalf of an organisation, or as an individual?Response'] == 'Organisation'))

pos_ind_cond = ((cons1_df['SA_polarity'] == 'pos') & (cons1_df['Are you responding on behalf of an organisation, or as an individual?Response'] == 'Individual'))
neg_ind_cond = ((cons1_df['SA_polarity'] == 'neg') & (cons1_df['Are you responding on behalf of an organisation, or as an individual?Response'] == 'Individual'))


In [43]:
# sample sentences
import random
random.seed(11)

pos_org_sample = cons1_df[pos_org_cond].sample(30)
neg_org_sample = cons1_df[neg_org_cond].sample(30)

pos_ind_sample = cons1_df[pos_ind_cond].sample(70)
neg_ind_sample = cons1_df[neg_ind_cond].sample(70)

In [44]:
print(pos_org_sample.shape[0])
print(neg_org_sample.shape[0])
print(pos_ind_sample.shape[0])
print(neg_ind_sample.shape[0])

30
30
70
70


In [45]:
datas = [pos_org_sample, neg_org_sample, pos_ind_sample, neg_ind_sample]

sa_q1_sample = pd.concat(datas)

In [46]:
sa_q1_sample = sa_q1_sample.iloc[:, [1, 40, 10, -7, -6, -5, -4, -3, -2, -1]]

In [47]:
# DO NOT RUN
#sa_q1_sample.to_csv("/Users/alessia/Documents/DataScience/NLP_Project/Outputs/sa_q1_sample.csv")

#### Add TextBlob polarity score

In [95]:
sa_q1_sample['sent_tok_text'] = sa_q1_sample['Q1_census_methods'].apply(lambda x: b_nlp.sent_tokenise_df(x))

In [99]:
sa_q1_sample['textblob_SA_score_sents'] = sa_q1_sample['sent_tok_text'].apply(lambda x: b_nlp.get_textblob_sentiment_score_df(x))

In [101]:
sa_q1_sample['textblob_mean_SA_score'] = sa_q1_sample['textblob_SA_score_sents'].apply(lambda x: np.mean(x))

In [103]:
sa_q1_sample['textblob_polarity'] = sa_q1_sample['textblob_mean_SA_score'].apply(lambda x: 'pos' if x > 0 else 'neg' if x < 0 else "")

In [109]:
sa_q1_sample[['textblob_polarity', 'SA_polarity', 'Q1_census_methods']];

In [105]:
sa_q1_sample.columns

Index(['Unnamed: 0', 'Respondent ID', 'Q1_census_methods',
       'Are you responding on behalf of an organisation, or as an individual?Response',
       'PublicSector', 'PrivateSector', 'OtherSectors', 'sent_tok_text',
       'SA_scores_sents', 'mean_SA_scores', 'SA_polarity',
       'textblob_SA_score_sents', 'textblob_mean_SA_score',
       'textblob_polarity'],
      dtype='object')

In [108]:
### re-save data
# DO NOT RUN
# sa_q1_sample.to_csv("/Users/alessia/Documents/DataScience/NLP_Project/Outputs/sa_q1_sample_2.csv")

#### Plot agreement between VADER and TextBlob polarity classification

In [117]:
pd.crosstab(sa_q1_sample['SA_polarity'], sa_q1_sample['textblob_polarity'])

textblob_polarity,Unnamed: 1_level_0,neg,pos
SA_polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
neg,4,22,74
pos,3,5,92


In [119]:
sa_q1_sample[sa_q1_sample['textblob_polarity'] == ""];
# 7 answers that received a 0 from TextBlob (clearical inspections: 4 correctly given 0)

In [120]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(sa_q1_sample['SA_polarity'], sa_q1_sample['textblob_polarity']) 

0.16908212560386471