In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib as plt
%matplotlib inline

In [2]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gobinathvelusamy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

 1. Read and Analyse Dataset. 

A. Clearly write outcome of data analysis

In [3]:
df = pd.read_csv('blogtext.csv')

In [4]:
df.shape

(681284, 7)

In [5]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [6]:
df.drop(['id','date'], axis=1, inplace=True)

In [7]:
df.describe(include='all')

Unnamed: 0,gender,age,topic,sign,text
count,681284,681284.0,681284,681284,681284
unique,2,,40,12,611652
top,male,,indUnk,Cancer,urlLink
freq,345193,,251015,65048,445
mean,,23.932326,,,
std,,7.786009,,,
min,,13.0,,,
25%,,17.0,,,
50%,,24.0,,,
75%,,26.0,,,


In [8]:
print('Unique values in the column gender are',df['gender'].nunique(),'\n')
print('Unique values in the column age are',df['age'].nunique(),'\n')
print('Unique values in the column topic are',df['topic'].nunique(),'\n')
print('Unique values in the column sign are',df['sign'].nunique())

Unique values in the column gender are 2 

Unique values in the column age are 26 

Unique values in the column topic are 40 

Unique values in the column sign are 12


1.Dataset has 7 Columns.Here 2 Columns id ,date is not required for further processing,sw we are dropping
2.Text column will be X for our analysis
3.Remaining columns gender,age,topic,sign will be labelled and used as Y 
4.Labelled records unique records mentined above

In [9]:
#Restrict data for running 
df=df.sample(10000)

In [10]:
df.shape

(10000, 5)

B. Clean the Structured 

i. Missing value analysis and imputation.

In [11]:
df.isna().any()


gender    False
age       False
topic     False
sign      False
text      False
dtype: bool

There is no Missing values in data 

ii. Eliminate Non-English textual data. 

In [13]:
pip install langdetect

Note: you may need to restart the kernel to use updated packages.


In [12]:
from langdetect import detect

def detect_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

In [14]:
df = df[df['text'].apply(detect_english)]


In [15]:
df.shape

(9565, 5)

In [None]:
#35 non English records dropped

2. Preprocess unstructured data to make it consumable for model training. 

A. Eliminate All special Characters and Numbers

In [16]:
pattern = "[^\w ]"
df.text = df.text.apply(lambda s : re.sub(pattern,"",s))

B. Lowercase all textual data 

In [17]:
df.text = df.text.apply(lambda s: s.lower())


C. Remove all Stopwords 

In [18]:
#remove stopwords
stopwords=set(stopwords.words('english'))
df.text = df.text.apply(lambda t: ' '.join([words for words in t.split() if words not in stopwords]) )

D. Remove all extra white spaces 

In [19]:
df.text = df.text.apply(lambda s: s.strip())


 3. Build a base Classification model 

A. Create dependent and independent variables 

In [20]:
df['labels']=df.apply(lambda col: [col['gender'],str(col['age']),col['topic'],col['sign']], axis=1)

In [21]:
df=df[['text','labels']]

In [22]:
X=df['text']
Y=df['labels']


In [23]:
df.head()

Unnamed: 0,text,labels
172659,lookie created blogi actually get offline onee...,"[female, 16, Sports-Recreation, Pisces]"
166446,bill clinton toronto yesterday signing copies ...,"[male, 37, Marketing, Gemini]"
546322,new incoming supply currency ive bit gaming bi...,"[male, 16, indUnk, Libra]"
219708,interesting hit morning search knitting seroto...,"[female, 38, indUnk, Virgo]"
235193,standard email mum england reads like took awa...,"[female, 25, Engineering, Scorpio]"


B. Split data into train and test.

C. Vectorize data using any one vectorizer.

Lets perform count vectorizer with bi-grams and tri-grams to get the count vectors of the X data¶


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(binary=True, ngram_range=(1,2))
X=vectorizer.fit_transform(X)

In [25]:
X[1]

<1x814319 sparse matrix of type '<class 'numpy.int64'>'
	with 276 stored elements in Compressed Sparse Row format>

In [27]:
label_counts=dict()

for labels in df.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label]+=1
    else:
        label_counts[label]=1

In [28]:
label_counts

{'Pisces': 1,
 'Gemini': 1,
 'Libra': 1,
 'Virgo': 1,
 'Scorpio': 1,
 'Aries': 1,
 'Capricorn': 1,
 'Cancer': 1,
 'Aquarius': 1,
 'Sagittarius': 1,
 'Taurus': 1,
 'Leo': 1}

In [29]:
#Preprocssing
from sklearn.preprocessing import MultiLabelBinarizer
binarizer=MultiLabelBinarizer(classes=sorted(label_counts.keys()))

In [30]:
Y=binarizer.fit_transform(df.labels)

In [31]:
Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

B. Split data into train and test. 

In [32]:
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2)

In [33]:
print("Shape of X_train",Xtrain.shape)
print("Shape of X_test",Xtest.shape)
print("Shape of y_train",Ytrain.shape)
print("Shape of y_test",Ytest.shape)

Shape of X_train (7652, 814319)
Shape of X_test (1913, 814319)
Shape of y_train (7652, 12)
Shape of y_test (1913, 12)


D. Build a base model for Supervised Learning - Classification.

In [34]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [35]:
model=LogisticRegression(solver='lbfgs')
model=OneVsRestClassifier(model)


In [36]:
model.fit(Xtrain,Ytrain)

In [37]:
Ypred=model.predict(Xtest)

In [38]:
Ypred_inversed = binarizer.inverse_transform(Ypred)
y_test_inversed = binarizer.inverse_transform(Ytest)

In [39]:
for i in range(5):
    print('Text:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        Xtest[i],
        ','.join(y_test_inversed[i]),
        ','.join(Ypred_inversed[i])
    ))

Text:	  (0, 404579)	1
  (0, 480085)	1
  (0, 706817)	1
  (0, 707350)	1
  (0, 712032)	1
  (0, 812011)	1
  (0, 704347)	1
  (0, 522190)	1
  (0, 772642)	1
  (0, 734733)	1
  (0, 245418)	1
  (0, 281926)	1
  (0, 155655)	1
  (0, 212755)	1
  (0, 440910)	1
  (0, 273316)	1
  (0, 421247)	1
  (0, 649228)	1
  (0, 649388)	1
  (0, 707436)	1
  (0, 190824)	1
  (0, 142610)	1
  (0, 407929)	1
  (0, 348975)	1
  (0, 480444)	1
  (0, 324254)	1
  (0, 324279)	1
  (0, 441477)	1
  (0, 812166)	1
  (0, 282026)	1
  (0, 523651)	1
  (0, 441254)	1
  (0, 712416)	1
  (0, 273668)	1
  (0, 707058)	1
  (0, 190849)	1
  (0, 348980)	1
  (0, 156500)	1
  (0, 735043)	1
  (0, 772848)	1
  (0, 142710)	1
  (0, 704646)	1
  (0, 212937)	1
  (0, 282236)	1
  (0, 245596)	1
True labels:	Aries
Predicted labels:	


Text:	  (0, 279681)	1
  (0, 477114)	1
  (0, 404579)	1
  (0, 364882)	1
  (0, 346483)	1
  (0, 390218)	1
  (0, 293884)	1
  (0, 244674)	1
  (0, 500405)	1
  (0, 735913)	1
  (0, 706817)	1
  (0, 705675)	1
  (0, 195917)	1
  (0, 689863)	1
  (0

E. Clearly print Performance Metrics.

In [40]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(Ytest, Ypred):
    print('Accuracy score: ', accuracy_score(Ytest, Ypred))
    print('F1 score: ', f1_score(Ytest, Ypred, average='micro'))
    print('Average precision score: ', average_precision_score(Ytest, Ypred, average='micro'))
    print('Average recall score: ', recall_score(Ytest, Ypred, average='micro'))

In [41]:
print_evaluation_scores(Ytest, Ypred)

Accuracy score:  0.0036591740721380033
F1 score:  0.007212776919113859
Average precision score:  0.08394319567868966
Average recall score:  0.0036591740721380033


4. Improve Performance of model.

A. Experiment with other vectorisers. (using word2vec/GLOV)

Which vectorizer performed better? Probable reason?