In [1]:
import pandas as pd

Datasets - https://www.kaggle.com/stackoverflow/statsquestions

In [2]:
df_questions = pd.read_csv('datasets/statsquestions/Questions.csv', 
                           encoding='iso-8859-1')

In [3]:
df_questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,<p>What are some of the ways to forecast demog...
2,22,66.0,2010-07-19T19:25:39Z,208,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...
3,31,13.0,2010-07-19T19:28:44Z,138,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...
4,36,8.0,2010-07-19T19:31:47Z,58,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n..."


In [4]:
len(df_questions)

85085

In [5]:
len(df_questions.Id.unique())

85085

#### Trimming the dataset
We whittle down to 5000 samples first. We'll apply additional filters later on

In [6]:
df_questions = df_questions[:5000]

In [7]:
df_questions.iloc[3]['Body']

'<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests.  It seems that students easily learn how to perform the calculations required by a given test but get hung up on interpreting the results.  Many computerized tools report test results in terms of "p values" or "t values".</p>\n\n<p>How would you explain the following points to college students taking their first course in statistics:</p>\n\n<ul>\n<li><p>What does a "p-value" mean in relation to the hypothesis being tested?  Are there cases when one should be looking for a high p-value or a low p-value?</p></li>\n<li><p>What is the relationship between a p-value and a t-value?</p></li>\n</ul>\n'

In [8]:
df_questions.iloc[3]['Id']

31

In [9]:
df_tags = pd.read_csv('datasets/statsquestions/Tags.csv', 
                      encoding='iso-8859-1')

In [10]:
df_tags.head()

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality


#### The tags associated with the questions we looked at

In [11]:
df_tags[df_tags['Id'] == 31]

Unnamed: 0,Id,Tag
42,31,hypothesis-testing
43,31,t-test
44,31,p-value
45,31,interpretation
46,31,intuition


In [12]:
len(df_tags)

244228

In [13]:
len(df_tags.Tag.unique())

1315

In [14]:
grouped_tags = df_tags.groupby("Tag", sort='count') \
                      .size() \
                      .reset_index(name='count') 

In [15]:
grouped_tags.sample(10)

Unnamed: 0,Tag,count
481,gradient-descent,277
1041,reproducibility,5
239,coverage-probability,19
396,fallacy,8
783,multivariate-regression,131
1223,tobit-regression,65
456,genetic-algorithms,82
50,autocorrelation,678
983,quasi-monte-carlo,11
1030,relative-risk,61


#### We pick the 10 most popular tags
Later, we will filter out questions which do not have these popular tags associated with them

In [16]:
num_classes = 10

most_common_tags = grouped_tags.nlargest(num_classes, columns="count")

most_common_tags

Unnamed: 0,Tag,count
986,r,13236
1020,regression,10959
669,machine-learning,6089
1220,time-series,5559
946,probability,4217
521,hypothesis-testing,3869
1096,self-study,3732
317,distributions,3501
657,logistic,3316
155,classification,2881


In [17]:
df_tags.Tag = df_tags.Tag.apply(lambda tag : tag \
                                if tag in most_common_tags.Tag.values \
                                else None)

In [18]:
df_tags = df_tags.dropna()

#### The number of entries has whittled down
Compare with 244228 previously

In [19]:
len(df_tags)

57359

In [20]:
df_tags.head(5)

Unnamed: 0,Id,Tag
3,2,distributions
7,4,distributions
9,6,machine-learning
29,23,distributions
33,25,time-series


#### Cleaning up the questions

In [21]:
import re 

def strip_html_tags(body):
    regex = re.compile('<.*?>')
    return re.sub(regex, '', body)

In [22]:
df_questions['Body'] = df_questions['Body'].apply(strip_html_tags)

In [23]:
df_questions['Body'].head(5)

0    Last year, I read a blog post from Brendan O'C...
1    What are some of the ways to forecast demograp...
2    How would you describe in plain English the ch...
3    After taking a statistics course and then tryi...
4    There is an old saying: "Correlation does not ...
Name: Body, dtype: object

#### Combine the Title and Body fields
This becomes a single field called Text

In [24]:
df_questions['Text'] = df_questions['Title'] + ' ' + df_questions['Body']

#### Trim out unwanted columns
We only need the Id and Text columns

In [25]:
df_questions = df_questions.drop(['Title', 
                                  'Body', 
                                  'OwnerUserId', 
                                  'CreationDate', 
                                  'Score'], axis=1)

In [26]:
df_questions.sample(5)

Unnamed: 0,Id,Text
3057,212042,Elasticity vs marginal effects in probit model...
1006,211752,ROC plot not in one curve but scattered I'm ne...
3901,135967,Choosing right set of variables for Logistic r...
900,229465,Understanding multinomial regression I have da...
2784,134241,How to generate sorted uniformly distributed v...


In [27]:
top_tags = df_tags.Tag.unique()
top_tags

array(['distributions', 'machine-learning', 'time-series',
       'hypothesis-testing', 'r', 'classification', 'regression',
       'probability', 'logistic', 'self-study'], dtype=object)

#### Add a column containing a list of associated tags
Create a function to lookup the df_tags dataframe for associated tags for each row in the questions dataframe. We also <b>effectively perform a one-hot encoding</b> of the tag by setting 1 in a column corresponding to the tag name

In [28]:
def add_tag_column(row):
    
    all_tags_per_question = df_tags[df_tags['Id'] == row['Id']].Tag.values
    row['Tags'] = all_tags_per_question
    
    for tag in top_tags:
        if tag in all_tags_per_question:
            row[str(tag)] = 1
        else:
            row[str(tag)] = 0
    
    return row

In [29]:
df_questions_final = df_questions.apply(add_tag_column, axis=1)

In [30]:
df_questions_final.sample(10)

Unnamed: 0,Id,Text,Tags,distributions,machine-learning,time-series,hypothesis-testing,r,classification,regression,probability,logistic,self-study
107,109213,ANOVA data prep - sum across I have an experim...,[],0,0,0,0,0,0,0,0,0,0
1166,229490,Modelling the distribution of race results in ...,[distributions],1,0,0,0,0,0,0,0,0,0
4707,222670,Model to determine how many actions someone ne...,[],0,0,0,0,0,0,0,0,0,0
17,203,Group differences on a five point Likert item ...,[],0,0,0,0,0,0,0,0,0,0
92,94521,How to interpret results if a reference catego...,"[regression, logistic]",0,0,0,0,0,0,1,0,1,0
1495,179431,Statistical Significance Test With Evolving Gr...,[],0,0,0,0,0,0,0,0,0,0
1324,1357,"Is mutual information invariant to scaling, i....",[],0,0,0,0,0,0,0,0,0,0
152,643,How do you convey the beauty of the Central Li...,[],0,0,0,0,0,0,0,0,0,0
3331,71074,Compare different statistical models that fore...,[],0,0,0,0,0,0,0,0,0,0
1076,57882,Lmer model design My data is a series of repea...,[r],0,0,0,0,1,0,0,0,0,0


#### Remove questions with no associated tags

In [31]:
def remove_zero_tags(row):
    if len(row['Tags']) == 0 :
        return None
    else :
        return row 

In [32]:
df_questions_final = df_questions_final.apply(remove_zero_tags, axis=1)

In [33]:
df_questions_final = df_questions_final.dropna()

#### Number of questions is now much smaller

In [34]:
len(df_questions_final)

2631

In [35]:
df_questions_final.sample(10)

Unnamed: 0,Id,Text,Tags,distributions,machine-learning,time-series,hypothesis-testing,r,classification,regression,probability,logistic,self-study
3643,103939.0,Why doesn't Wilks' 1938 proof work for misspec...,[hypothesis-testing],0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4734,44632.0,Feature importance The extremely randomized tr...,[classification],0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1692,211835.0,How to draw a Shepard diagram from PCA scores?...,[r],0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3339,121998.0,"75% success, ranges for given sample sizes wit...","[probability, self-study]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2516,135804.0,"combining 4 sets of data, which are collected ...",[self-study],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4312,106069.0,Predicting on data consisting of many independ...,[time-series],0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1759,82129.0,Determining if experiment data can be pooled f...,[regression],0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2870,27255.0,Testing samples against a distribution Suppose...,"[distributions, hypothesis-testing]",1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2409,105509.0,Integrating an empirical CDF I have an empiric...,[r],0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4557,106170.0,Faith in an extrapolated result I would like t...,[regression],0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [36]:
from sklearn.model_selection import train_test_split

In [37]:
train, test = train_test_split(df_questions_final, 
                               random_state=42, 
                               test_size=0.33, 
                               shuffle=True)

In [38]:
train.head()

Unnamed: 0,Id,Text,Tags,distributions,machine-learning,time-series,hypothesis-testing,r,classification,regression,probability,logistic,self-study
2989,186438.0,Simple Null Hypothesis Question Say you are re...,"[regression, hypothesis-testing, self-study]",0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3900,122060.0,Assessing predictor contribution to model outp...,[machine-learning],0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2122,121752.0,What Ratio of Independent Distributions gives ...,"[probability, distributions]",1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1638,156834.0,How to calculate deviance explained for a gene...,[r],0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4320,210598.0,"Order Statistics, Expected Value of range, $E(...","[probability, self-study, distributions]",1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [39]:
X_train = train.Text
X_test = test.Text

X_train.shape, X_test.shape

((1762,), (869,))

### The OneVsRest classifier
Also known as one-vs-all, this strategy consists in fitting one classifier per class

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

Corpse - http://www.nltk.org/nltk_data/

In [41]:
stopwords_df = pd.read_csv("datasets/stopwords/english", 
                           engine='python', 
                           header=None,
                           usecols=[0], 
                           names=['words'])

In [42]:
stopwords_df.head().values

array([['i'],
       ['me'],
       ['my'],
       ['myself'],
       ['we']], dtype=object)

In [43]:
stop_words = set(stopwords_df.words.unique())

In [44]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)),
            ])

In [45]:
from sklearn.metrics import accuracy_score

In [50]:
for tag in top_tags:
    print('Processing {} ...'.format(tag))

    SVC_pipeline.fit(X_train, train[tag])

    prediction = SVC_pipeline.predict(X_test)
    
    print('Test accuracy is {} \n'.format(accuracy_score(test[tag], prediction)))

Processing distributions ...
Test accuracy is 0.9390103567318757 

Processing machine-learning ...
Test accuracy is 0.9010356731875719 

Processing time-series ...
Test accuracy is 0.9401611047180668 

Processing hypothesis-testing ...
Test accuracy is 0.9378596087456847 

Processing r ...
Test accuracy is 0.8457997698504027 

Processing classification ...
Test accuracy is 0.956271576524741 

Processing regression ...
Test accuracy is 0.8665132336018412 

Processing probability ...
Test accuracy is 0.9159953970080552 

Processing logistic ...
Test accuracy is 0.9470655926352128 

Processing self-study ...
Test accuracy is 0.9321058688147296 

