Text Preprocessing

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
import string

import numpy as np
import seaborn as sns
import itertools 
import csv
import collections
import matplotlib.pyplot as plt

# Use seaborn's "paper" plotting context for the figures of this notebook.
sns.set_context("paper")
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

# Directory prefix of the input CSV files; it is concatenated directly with
# the file name, so the trailing slash is required.
RES_DIR = "../input/"

Loading the datasets
---------

In [None]:
# Load the training data (the content column is not read)
def load_train_data():
    """Read one CSV per category and stack them into a single DataFrame.

    Only the ``id``, ``title`` and ``tags`` columns are read from each
    ``<RES_DIR><category>.csv`` file. A ``category`` column holding the
    category name is added to every frame before concatenation.

    Returns:
        pandas.DataFrame: the per-category frames concatenated in the order
        cooking, robotics, travel, crypto, diy, biology. The row index of
        each file is kept as read (it is not reset).
    """
    wanted_columns = ['id', 'title', 'tags']

    def read_category(name):
        # One file per category, labelled with the category it came from.
        frame = pd.read_csv("{}{}.csv".format(RES_DIR, name), usecols=wanted_columns)
        frame['category'] = name
        return frame

    category_names = ('cooking', 'robotics', 'travel', 'crypto', 'diy', 'biology')
    return pd.concat([read_category(name) for name in category_names])

In [None]:
# Training set: id, title, tags and category of every question.
train_data = load_train_data()
# Test set, read from the same input directory as the training files
# (RES_DIR) instead of a second hard-coded copy of the path.
test = pd.read_csv("{}test.csv".format(RES_DIR))
train_data.head()
test.head()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Bag-of-words counts over the test content: English stop words are removed,
# as are terms found in more than 95% of the documents or in fewer than 2.
convec = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
corpus = test['content'].values
ldavec = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50., random_state=0)
Xtf = convec.fit_transform(corpus)
# fit() returns the fitted estimator itself, so Ytf is the same object as ldavec.
Ytf = ldavec.fit(Xtf)
# The vocabulary lives on the fitted vectorizer, not on the sparse matrix it
# returns. Newer scikit-learn releases expose it as get_feature_names_out();
# older ones only have get_feature_names().
if hasattr(convec, "get_feature_names_out"):
    tf_names = list(convec.get_feature_names_out())
else:
    tf_names = convec.get_feature_names()
print(tf_names)


Removing HTML tags and URIs from contents
-----------

In [None]:
# Case-insensitive ((?i)) pattern used to delete URIs from text. A match starts
# with http:// or https://, with "www" plus up to three digits and a dot, or
# with a domain-like token ending in a 2-4 letter suffix and a slash. It then
# consumes non-space characters (including balanced parentheses) and must not
# end on trailing punctuation.
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the plain text of an HTML fragment without code samples or URIs.

    Args:
        x: HTML markup as ``str`` (or ``bytes``). Empty input and any other
            type, such as the float NaN pandas uses for a missing cell,
            produce an empty string.

    Returns:
        str: the text of ``x`` with every ``<code>`` element (tag and
        content) removed, the remaining markup stripped, and every match of
        ``uri_re`` deleted.
    """
    # A missing cell is a float NaN: it is truthy, but it is not markup.
    if not isinstance(x, (str, bytes)) or not x:
        return ""
    soup = BeautifulSoup(x, "html.parser")
    # soup.code only points at the first <code> element, so keep removing
    # until none is left; nested elements disappear with their parent.
    code_tag = soup.code
    while code_tag is not None:
        code_tag.decompose()
        code_tag = soup.code
    # Get all the text out of the html
    text = soup.get_text()
    # Returning text stripping out all uris
    return re.sub(uri_re, "", text)

In [None]:
# This could take a while
train_data["title"] = train_data["title"].map(stripTagsAndUris)
test["content"] = test["content"].map(stripTagsAndUris)

In [None]:
# Preview both frames after tag/URI stripping. A notebook cell only renders
# its last expression, so only the test preview is displayed.
train_data.head()
test.head()

Removing punctuation from titles and contents
-----------

In [None]:
def removePunctuation(x):
    """Lowercase ``x`` and blank out non-ASCII characters and punctuation.

    Args:
        x: text to normalise. Non-string values (for example the NaN of a
            missing cell) are treated as empty text.

    Returns:
        str: the lowercased text in which every non-ASCII character and every
        character of ``string.punctuation`` is replaced by one space. Runs of
        spaces are not collapsed.
    """
    if not isinstance(x, str):
        return ""
    # Lowercasing all words
    x = x.lower()
    # Replacing non ASCII chars with spaces
    x = re.sub(r'[^\x00-\x7f]', r' ', x)
    # string.punctuation must be escaped before it is placed in a character
    # class: unescaped, its "\]" pair is read as an escaped bracket and the
    # backslash itself is never matched.
    return re.sub("[" + re.escape(string.punctuation) + "]", " ", x)

In [None]:
# Normalise case and punctuation of the training titles and of both text
# columns of the test set.
for frame, column in ((train_data, "title"), (test, "title"), (test, "content")):
    frame[column] = frame[column].map(removePunctuation)

In [None]:
# Preview both frames after punctuation removal. A notebook cell only renders
# its last expression, so only the test preview is displayed.
train_data.head()
test.head()

Removing stopwords from titles and contents
-----------

In [None]:
# English stop words from NLTK, kept in a set for fast membership tests
stops = set(stopwords.words("english"))
def removeStopwords(x):
    """Return ``x`` without the whitespace-separated words found in ``stops``.

    The words that remain are joined back together with single spaces.
    """
    return " ".join(token for token in x.split() if token not in stops)

In [None]:
    # Drop stop words from the training titles and from both text columns of
    # the test set.
    train_data["title"] = train_data["title"].map(removeStopwords)
    test["title"] = test["title"].map(removeStopwords)
    test["content"] = test["content"].map(removeStopwords)

In [None]:
# Preview the test frame after stop-word removal.
test.head()

Splitting the tags string into a list of tags
-----------

In [None]:
# Summary about tags
# Tokens of the tags column, one list per question
tag_lists = [tag_string.split() for tag_string in train_data['tags'].values]
# Title words are collected as well and treated as additional tags
tag_lists2 = [title.split() for title in train_data['title'].values]
all_tags = list(itertools.chain.from_iterable(tag_lists + tag_lists2))
# Per-question tag counts (title words are not included in these statistics)
tag_list_size = np.array(list(map(len, tag_lists)))
print("""The corpus is composed by {} questions. Overall {} tags have been used, of which {} unique ones. 
Average number of tags per question {:.2f} (min={}, max={}, std={:.2f})""".format(
    len(train_data),
    len(all_tags),
    len(set(all_tags)),
    tag_list_size.mean(),
    tag_list_size.min(),
    tag_list_size.max(),
    tag_list_size.std()))

In [None]:
# Utility function to return the top occurring tags in the passed df
def get_top_tags(df, n=None):
    """Count the most frequent tokens of the ``tags`` and ``title`` columns.

    Args:
        df: DataFrame with string columns ``tags`` and ``title``; every value
            is split on whitespace and each piece counts as one tag.
        n: number of most common tags to return; ``None`` returns all.

    Returns:
        tuple: ``(tags, count)``, two tuples of equal length ordered from the
        most to the least frequent tag. Both are empty when ``df`` holds no
        tokens.
    """
    # Tags first, then title words: the same order the counter saw before,
    # so ties keep their ranking.
    tokens = itertools.chain.from_iterable(
        text.split()
        for column in ('tags', 'title')
        for text in df[column].values
    )
    top_tags = collections.Counter(tokens).most_common(n)
    if not top_tags:
        # zip(*[]) yields nothing, so the two-name unpacking below would fail.
        return (), ()
    tags, count = zip(*top_tags)
    return tags, count

In [None]:
# DataFrame indexed on every tag / title word of the training data. The set
# is sorted so that the row order is the same on every run.
tags_df = pd.DataFrame(index=sorted(set(itertools.chain(*tag_lists,*tag_lists2))))
# For each category create a column holding how often each tag occurs in it
for name, group in train_data.groupby('category'):
    tmp_index, count = get_top_tags(group)
    tmp = pd.Series(count, index=tmp_index)
    # Assign the aligned counts as the column instead of updating the result
    # of tags_df[name] in place: with copy-on-write that in-place update is
    # not written back to the frame. Tags absent from the category get 0.
    tags_df[name] = tmp.reindex(tags_df.index, fill_value=0)
# Number of categories for which a tag appeared at least 1 time
tags_df['categories_appears'] = tags_df.astype(bool).sum(axis=1)
tags_df['categories_appears'].value_counts()

In [None]:
# viewing the table of tags
from sklearn import preprocessing
# A is an alias of tags_df, not a copy: removing the helper column here also
# removes it from tags_df, whose columns label the SVD result further down.
A=tags_df
# Guarded so that the cell can be run again without raising a KeyError.
if 'categories_appears' in A.columns:
    del A['categories_appears']
# Rows scaled to unit L2 norm; printed for inspection only.
A_n = preprocessing.normalize(A, norm='l2')
print(A_n)

Solving the question with a Singular Value Decomposition (this is the core step of the notebook)
-----------

In [None]:
from numpy.linalg import inv
# Thin SVD of the tag-by-category count matrix. numpy returns the right
# factor already transposed, so A = U * diag(s) * V.
U, s, V = np.linalg.svd(A, full_matrices=False)
# Singular values as a diagonal matrix, and its inverse
S = np.diag(s)
iS = inv(S)
# Left singular vectors scaled by the inverse singular values
US = U @ iS
# The same matrix as a DataFrame labelled like tags_df: one row per tag.
# Tag rows from this frame are later combined with V to score a query.
US_df = pd.DataFrame(US, index=tags_df.index, columns=tags_df.columns)

In [None]:
# Example query built from the tags 'extreme-tourism' and 'antarctica'; the
# author expects the scores to point at the travel category.
# Slicing with the same label as start and stop presumably selects the row(s)
# carrying that label as a DataFrame -- TODO confirm.
df1=US_df['extreme-tourism':'extreme-tourism':]
df2=US_df['antarctica':'antarctica':]
frames = [df1,df2]
# Column-wise sum of the selected rows: one value per column of US_df.
Qtemp=pd.concat(frames).sum()
# One score per column: the product of the query with V, divided element-wise
# by the same product computed from absolute values.
# NOTE(review): this is not the usual cosine similarity -- confirm the
# intended normalisation.
np.dot(Qtemp,V)/np.dot(np.abs(Qtemp),np.abs(V))

The training tells me it is 100% travel and 20% biology, which is unexpected.

In [None]:
def taggifytitle(x, threshold=0.89):
    """Return the category names that best match the words of ``x``.

    The rows of ``US_df`` belonging to the known words of ``x`` are summed
    into a query vector, which is scored against every category with
    ``dot(query, V) / dot(|query|, |V|)``.

    Args:
        x: whitespace-separated text. Non-string values give no category.
        threshold: a category is reported when its score is above this
            value. The best scoring category is always reported.

    Returns:
        str: the selected names from ``columns``, in that order, each one
        followed by a single space. Empty when no word of ``x`` is in
        ``US_df`` or when no score could be computed.
    """
    if not isinstance(x, str):
        return ''
    # A repeated word counts once per occurrence.
    known_words = [word for word in x.split() if word in US_df.index]
    if not known_words:
        return ''
    # One label-based selection instead of growing a frame with
    # DataFrame.append, which is quadratic and removed in pandas 2.
    query = US_df.loc[known_words, columns].sum().values
    # 0/0 entries become NaN; they are skipped below instead of warning here.
    with np.errstate(divide='ignore', invalid='ignore'):
        simila = np.asarray(np.dot(query, V) / np.dot(np.abs(query), np.abs(V)), dtype=float)
    scored = ~np.isnan(simila)
    if not scored.any():
        return ''
    best = simila[scored].max()
    tempprnt = ''
    # Walk over every category: a fixed range(0, 5) left out the last entry
    # of ``columns``, so it could never be reported.
    for position, category in enumerate(columns):
        if simila[position] > threshold or simila[position] == best:
            tempprnt += category + ' '
    return tempprnt

In [None]:
# Category labels used to name the scores returned by taggifytitle
columns = ['biology','cooking','crypto','diy','robotics','travel']
# One zero per label, plus the helper column that the frame below leaves out
data = {label: [0] for label in columns + ['categories_appears']}
# Single all-zero row labelled 'blanco', restricted to the category columns
newDF = pd.DataFrame(data, columns=columns, index=['blanco'])
# Score only the first 100 test contents
stukjetesten = test['content'][0:100]
stukjetesten.map(taggifytitle)

    

In [None]:
# Write the test frame to CSV without the row index.
# NOTE(review): the result of the taggifytitle mapping is never stored in
# `test`, so this file presumably holds only the preprocessed input columns
# -- confirm whether the predicted tags were meant to be saved.
test.to_csv("test_SVDPaul.csv", index=False)