In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import re
import nltk
import csv
import time
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thiagoabreu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Leitura dos dados

In [3]:
df = pd.read_csv(r'..//data/raw/bq-results-20220725-121025-1658751889618.csv')
df.head()

Unnamed: 0,title,body,tags
0,Using Components folder instead of Pages,<p>With Blazor being component based and compo...,directory|architecture|components|blazor
1,Select data from sqlite3 before or after a cer...,<p>I wan to <strong>select</strong> the data b...,javascript|database|typescript|sqlite|typeorm
2,Listen to Firebase Firestore data changes for ...,<p>Let's say I have a Firebase firestore datab...,javascript|reactjs|firebase|react-native|googl...
3,How to decode a base64 image and getting It's ...,<p>newbie here. I've been working on an image ...,python|tensorflow|machine-learning|base64|fastapi
4,Pods not found while using kubectl port-forward,<p>I want to forward the ports</p>\n<pre><code...,kubernetes|kubectl


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   500000 non-null  object
 1   body    500000 non-null  object
 2   tags    499994 non-null  object
dtypes: object(3)
memory usage: 11.4+ MB


O dataset possui três atributos, todos do tipo objeto (sendo o atributo *tags*, uma classe).

Porém, há amostras com tags vazias, vamos checar quais são.

In [5]:
tags_vazias = df[df['tags'].isna() == True]
tags_vazias

Unnamed: 0,title,body,tags
45182,How to avoid null pointer exception from fires...,<p>I have a firebase application which loads p...,
222104,ruby function is returning nil when it should not,<p>I have a written a ruby code that take two ...,
327727,Map Interface Methods. first things first< I w...,<p>// --------------Map Interface Methods-----...,
375230,Nan loss value after few epochs with Contrasti...,<p>I used a Siamese network with contrastive l...,
387970,Why is the Button Null?,<p>I'm receiving a NullPointerException. It sa...,
394739,Why are NaNs produced for pchisq?,<p>i was using serial.test to check for autoco...,


In [6]:
round((len(tags_vazias)/len(df))*100, 5)

0.0012

Todas elas se tratam de questões específicas de tecnologias/linguagens, portanto o uso de tags seria recomendado nesses casos. Desse modo, como suas tags estão vazias e representam apenas 0.0012% de todas as amostras do dataset, podemos remover essas amostras do banco.

In [7]:
df_sem_vazios = df.dropna()
df_sem_vazios.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499994 entries, 0 to 499999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   499994 non-null  object
 1   body    499994 non-null  object
 2   tags    499994 non-null  object
dtypes: object(3)
memory usage: 15.3+ MB


Embora todos os dados sejam textuais, há a possibilidade de haver amostras duplicadas

In [8]:
df_sem_duplicados = df_sem_vazios.drop_duplicates()
df_sem_duplicados.count()

title    499994
body     499994
tags     499994
dtype: int64

Os valores da coluna body, por serem escritos originalmente em *Markdown*, possuem caracteres especiais (sequências ASCII) e tags HTML. É interessante removê-las para limpar o corpo das questões, pois esses caracteres e tags não influenciam na questão em si, apenas em sua formatação.

In [9]:
df_sem_duplicados['body'] = df_sem_duplicados['body'].str.replace(r'<[^<>]*>|\\', '', regex=True)
df_sem_duplicados['body'] = df_sem_duplicados['body'].str.replace(r'\n', ' ', regex=True)

In [10]:
df_sem_duplicados.head()

Unnamed: 0,title,body,tags
0,Using Components folder instead of Pages,With Blazor being component based and componen...,directory|architecture|components|blazor
1,Select data from sqlite3 before or after a cer...,I wan to select the data before or after a cer...,javascript|database|typescript|sqlite|typeorm
2,Listen to Firebase Firestore data changes for ...,Let's say I have a Firebase firestore database...,javascript|reactjs|firebase|react-native|googl...
3,How to decode a base64 image and getting It's ...,newbie here. I've been working on an image cla...,python|tensorflow|machine-learning|base64|fastapi
4,Pods not found while using kubectl port-forward,I want to forward the ports kubectl port-forwa...,kubernetes|kubectl


In [None]:
#Porter Stemmer object for stemming
ps = PorterStemmer()
#creating the corpus for all the articles
corpus = []
chunk_size = 10000#Preprocessing

for i in range(0,chunk_size):
    body = re.sub('[^a-zA-Z]', ' ', df['body'][i])
    title = re.sub('[^a-zA-Z]', ' ', df['title'][i])
    body = body.lower()
    title = title.lower()
    body = body.split()
    title = title.split()
    body = [ps.stem(word) for word in body if not word in set(stopwords.words('english')) and len(word) != 1] 
    title = [ps.stem(word) for word in title if not word in set(stopwords.words('english')) and len(word) != 1] 
    body = ' '.join(body)
    title = ' '.join(title)
    paragraph = body + title
    corpus.append(paragraph)



In [12]:
len(corpus)

10000

In [17]:

cv = CountVectorizer(max_df = 0.85, max_features = 1000)
word_count_vector_train = cv.fit_transform(corpus[:5000])#to see the words in the vocabulary use: list(cv.vocabulary_.keys())[:10]#calculate the IDF


<5000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 189320 stored elements in Compressed Sparse Row format>

In [None]:

#WARNING: ALWAYS USE IDF ON A LARGE CORPUS
#tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
#tfidf_transformer.fit(word_count_vector_train)# do this once, this is a mapping of index 

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
clf.fit(X_train_tfidf_vectorize, twenty_train.target)
view raw
