In [133]:
import numpy as np
import pandas as pd
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.metrics import f1_score
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ganesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [104]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ganesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [105]:
df = pd.read_csv('blogtext.csv')
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
id        681284 non-null int64
gender    681284 non-null object
age       681284 non-null int64
topic     681284 non-null object
sign      681284 non-null object
date      681284 non-null object
text      681284 non-null object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [107]:
blog_df=df.head(5000)

In [108]:
blog_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
id        5000 non-null int64
gender    5000 non-null object
age       5000 non-null int64
topic     5000 non-null object
sign      5000 non-null object
date      5000 non-null object
text      5000 non-null object
dtypes: int64(2), object(5)
memory usage: 273.5+ KB


# Preprocess rows of the “text” column

In [109]:
REMOVE_PUNCT = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', REMOVE_PUNCT))

blog_df["text_wo_punct"] = blog_df["text"].apply(lambda text: remove_punctuation(text))
blog_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,id,gender,age,topic,sign,date,text,text_wo_punct
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info has been found 100 pages and ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These are the team members Drewes...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks to Yahoos Toolbar I can no...


In [110]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
blog_df["text_wo_stop"] = blog_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
blog_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,id,gender,age,topic,sign,date,text,text_wo_punct,text_wo_stop
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info has been found 100 pages and ...,Info found 100 pages 45 MB pdf files Now wait ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These are the team members Drewes...,These team members Drewes van der Laag urlLink...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde MAAK JE EI...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks to Yahoos Toolbar I can no...,Thanks Yahoos Toolbar I capture URLs popupswhi...


In [111]:
cnt = Counter()
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])
blog_df["text_wo_stopfreq"] = blog_df["text_wo_stop"].apply(lambda text: remove_freqwords(text))
blog_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,id,gender,age,topic,sign,date,text,text_wo_punct,text_wo_stop,text_wo_stopfreq
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info has been found 100 pages and ...,Info found 100 pages 45 MB pdf files Now wait ...,Info found 100 pages 45 MB pdf files Now wait ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These are the team members Drewes...,These team members Drewes van der Laag urlLink...,These team members Drewes van der Laag urlLink...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde MAAK JE EI...,In het kader van kernfusie op aarde MAAK JE EI...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,testing testing,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks to Yahoos Toolbar I can no...,Thanks Yahoos Toolbar I capture URLs popupswhi...,Thanks Yahoos Toolbar I capture URLs popupswhi...


In [112]:
blog_df.drop(["text_wo_punct", "text_wo_stop"], axis=1, inplace=True)
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])
blog_df["text_wo_stopfreqrare"] = blog_df["text_wo_stopfreq"].apply(lambda text: remove_rarewords(text))
blog_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,gender,age,topic,sign,date,text,text_wo_stopfreq,text_wo_stopfreqrare
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info found 100 pages 45 MB pdf files Now wait ...,Info found 100 pages 45 MB pdf files Now wait ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These team members Drewes van der Laag urlLink...,These team members Drewes van der Laag urlLink...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde MAAK JE EI...,In het kader van kernfusie op aarde MAAK JE EI...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks Yahoos Toolbar I capture URLs popupswhi...,Thanks Yahoos Toolbar I capture URLs popupswhi...


In [113]:
lem = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lem.lemmatize(word) for word in text.split()])
blog_df["text_lemmatized"] = blog_df["text_wo_stopfreqrare"].apply(lambda text: lemmatize_words(text))
blog_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,id,gender,age,topic,sign,date,text,text_wo_stopfreq,text_wo_stopfreqrare,text_lemmatized
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info found 100 pages 45 MB pdf files Now wait ...,Info found 100 pages 45 MB pdf files Now wait ...,Info found 100 page 45 MB pdf file Now wait un...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These team members Drewes van der Laag urlLink...,These team members Drewes van der Laag urlLink...,These team member Drewes van der Laag urlLink ...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde MAAK JE EI...,In het kader van kernfusie op aarde MAAK JE EI...,In het kader van kernfusie op aarde MAAK JE EI...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,testing testing,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks Yahoos Toolbar I capture URLs popupswhi...,Thanks Yahoos Toolbar I capture URLs popupswhi...,Thanks Yahoos Toolbar I capture URLs popupswhi...


In [114]:
stem = PorterStemmer()
def stem_words(text):
    return " ".join([stem.stem(word) for word in text.split()])
blog_df["text_preprocessed"] = blog_df["text_lemmatized"].apply(lambda text: stem_words(text))
blog_df.head

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


<bound method NDFrame.head of            id  gender  age              topic      sign            date  \
0     2059027    male   15            Student       Leo     14,May,2004   
1     2059027    male   15            Student       Leo     13,May,2004   
2     2059027    male   15            Student       Leo     12,May,2004   
3     2059027    male   15            Student       Leo     12,May,2004   
4     3581210    male   33  InvestmentBanking  Aquarius    11,June,2004   
5     3581210    male   33  InvestmentBanking  Aquarius    10,June,2004   
6     3581210    male   33  InvestmentBanking  Aquarius    10,June,2004   
7     3581210    male   33  InvestmentBanking  Aquarius    10,June,2004   
8     3581210    male   33  InvestmentBanking  Aquarius    10,June,2004   
9     3581210    male   33  InvestmentBanking  Aquarius    09,June,2004   
10    3581210    male   33  InvestmentBanking  Aquarius    09,June,2004   
11    3581210    male   33  InvestmentBanking  Aquarius    09,June,200

# As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence 

In [115]:
label_y = []

for row in blog_df.iterrows():
    row_labels = []
    row_labels.append(str(row[1][1]))
    row_labels.append(str(row[1][2]))
    row_labels.append(str(row[1][3]))
    row_labels.append(str(row[1][4]))
    label_y.append(row_labels)

blog_df['Labels'] = label_y
blog_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,id,gender,age,topic,sign,date,text,text_wo_stopfreq,text_wo_stopfreqrare,text_lemmatized,text_preprocessed,Labels
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info found 100 pages 45 MB pdf files Now wait ...,Info found 100 pages 45 MB pdf files Now wait ...,Info found 100 page 45 MB pdf file Now wait un...,info found 100 page 45 MB pdf file now wait un...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These team members Drewes van der Laag urlLink...,These team members Drewes van der Laag urlLink...,These team member Drewes van der Laag urlLink ...,these team member drew van der laag urllink ma...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde MAAK JE EI...,In het kader van kernfusie op aarde MAAK JE EI...,In het kader van kernfusie op aarde MAAK JE EI...,In het kader van kernfusi op aard maak JE eige...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,testing testing,testing testing,test test,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks Yahoos Toolbar I capture URLs popupswhi...,Thanks Yahoos Toolbar I capture URLs popupswhi...,Thanks Yahoos Toolbar I capture URLs popupswhi...,thank yahoo toolbar I captur url popupswhich m...,"[male, 33, InvestmentBanking, Aquarius]"


# Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference

In [116]:
gender = blog_df['gender'].value_counts().to_dict()
print(gender)

{'male': 3294, 'female': 1706}


In [117]:
topics = blog_df['topic'].value_counts().to_dict()
print(topics)

{'Technology': 2332, 'indUnk': 1381, 'Student': 569, 'Engineering': 119, 'Education': 118, 'BusinessServices': 87, 'Sports-Recreation': 75, 'InvestmentBanking': 70, 'Communications-Media': 61, 'Non-Profit': 47, 'Science': 33, 'Arts': 31, 'Internet': 20, 'Consulting': 16, 'Banking': 16, 'Automotive': 14, 'Religion': 4, 'Law': 3, 'Accounting': 2, 'Museums-Libraries': 2}


In [118]:
age = blog_df['age'].value_counts().to_dict()
print(age)

{35: 2307, 34: 540, 24: 353, 15: 339, 17: 331, 25: 268, 14: 170, 23: 137, 33: 101, 26: 96, 27: 86, 39: 79, 16: 67, 36: 60, 37: 19, 41: 14, 45: 14, 42: 9, 46: 7, 44: 3}


In [119]:
blog_df = blog_df[["text_preprocessed", "Labels"]]
blog_df.head()

Unnamed: 0,text_preprocessed,Labels
0,info found 100 page 45 MB pdf file now wait un...,"[male, 15, Student, Leo]"
1,these team member drew van der laag urllink ma...,"[male, 15, Student, Leo]"
2,In het kader van kernfusi op aard maak JE eige...,"[male, 15, Student, Leo]"
3,test test,"[male, 15, Student, Leo]"
4,thank yahoo toolbar I captur url popupswhich m...,"[male, 33, InvestmentBanking, Aquarius]"


# Separate features and labels, and split the data into training and testing


In [120]:
X = blog_df['text_preprocessed']
y = blog_df['Labels']

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Vectorize the features

In [122]:
cnt_vectorizer = CountVectorizer(ngram_range=(1,2))
X_train = cnt_vectorizer.fit_transform(X_train)
X_test = cnt_vectorizer.transform(X_test)

In [123]:
cnt_vectorizer.get_feature_names()

['000',
 '000 peopl',
 '0000',
 '0000 blink',
 '001',
 '001 first',
 '002',
 '002 middl',
 '003',
 '003 last',
 '004',
 '004 nicknam',
 '005',
 '005 gender',
 '006',
 '006 age',
 '007',
 '007 birthday',
 '007 game',
 '007 jersey',
 '008',
 '008 height',
 '009',
 '009 hair',
 '01',
 '01 2003',
 '01 bett',
 '01 mean',
 '01 mind',
 '010',
 '010 eye',
 '010203',
 '010203 heheh',
 '011',
 '011 race',
 '012',
 '012 glass',
 '012 last',
 '01234',
 '01234 but',
 '013',
 '013 dodid',
 '014',
 '014 is',
 '015',
 '015 where',
 '016',
 '016 current',
 '017',
 '017 zodiac',
 '018',
 '018 how',
 '019',
 '019 nation',
 '02',
 '02 ad',
 '02 and',
 '02 britney',
 '02 face',
 '02 lott',
 '02 republican',
 '02 where',
 '020',
 '020 bad',
 '020031',
 '020031 pm',
 '021',
 '021 it',
 '021 pierc',
 '02182004',
 '02182004 urllink',
 '022',
 '022 pierc',
 '02232004',
 '023',
 '023 tattoo',
 '024',
 '024 tattoo',
 '025',
 '025 today',
 '025613',
 '025613 pm',
 '026',
 '026 main',
 '026 the',
 '027',
 '027 read

# Transform the labels - (7.5 points)
As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn
a.	Convert your train and test labels using MultiLabelBinarizer


In [125]:
mlb = MultiLabelBinarizer()
y_train_multi = mlb.fit_transform(y_train)
y_test_multi = mlb.transform(y_test)

# Choose a classifier - (5 points)
In this task, we suggest using the One-vs-Rest approach, which is implemented in OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use LogisticRegression. It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time because the number of classifiers to train is large.
a.	Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label

In [127]:
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
classif = OneVsRestClassifier(clf)
classif.fit(X_train, y_train_multi)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='multinomial',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False),
                    n_jobs=None)

In [129]:
pred = classif.predict(X_test)

# Fit the classifier, make predictions and get the accuracy

In [130]:
results = multilabel_confusion_matrix(y_test_multi, pred) 
print('Confusion Matrix :')
print(results)
print('Report : ')
print(classification_report(y_test_multi, pred))

Confusion Matrix :
[[[ 961    2]
  [  32    5]]

 [[ 922   10]
  [  46   22]]

 [[ 982    0]
  [  18    0]]

 [[ 916   10]
  [  55   19]]

 [[ 959   12]
  [  29    0]]

 [[ 926    5]
  [  50   19]]

 [[ 945    9]
  [  39    7]]

 [[ 978    0]
  [  22    0]]

 [[ 978    1]
  [  15    6]]

 [[ 978    1]
  [  12    9]]

 [[ 882    9]
  [  38   71]]

 [[ 452  100]
  [  48  400]]

 [[ 987    4]
  [   8    1]]

 [[ 998    0]
  [   2    0]]

 [[ 982    0]
  [  16    2]]

 [[ 999    0]
  [   1    0]]

 [[ 997    0]
  [   3    0]]

 [[1000    0]
  [   0    0]]

 [[ 996    1]
  [   3    0]]

 [[ 997    1]
  [   2    0]]

 [[1000    0]
  [   0    0]]

 [[ 918   13]
  [  46   23]]

 [[ 404  111]
  [  59  426]]

 [[ 994    0]
  [   6    0]]

 [[ 996    0]
  [   4    0]]

 [[ 995    0]
  [   5    0]]

 [[ 984    2]
  [  12    2]]

 [[ 985    0]
  [  13    2]]

 [[ 980    0]
  [  10   10]]

 [[ 988    1]
  [   9    2]]

 [[ 994    1]
  [   5    0]]

 [[ 969    2]
  [  24    5]]

 [[ 969    5]
  [  19

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [131]:
print('Accuracy Score :',accuracy_score(y_test_multi, pred))

Accuracy Score : 0.502
