<a href="https://colab.research.google.com/github/felanbi/Projet-OC-Categoriser-automatiquement-des-questions/blob/main/P4_01_notebookexploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
path_queries = 'drive/MyDrive/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import IPython.display
import os
import nltk
import re
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import FreqDist

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
sql_query = '''
SELECT
  Id,
  Title,
  Body,
  Tags
FROM 
  Posts
WHERE 
  Tags != ''
  and CreationDate >= '2020-01-01' 
  and Score > 0
  and ViewCount >= 100
  and AnswerCount > 0
'''

In [18]:
data = pd.DataFrame()

print('Lecture des fichiers:\n')
for f in os.listdir(path_queries):
    if 'query' in f:
        print('Fichier en cours de chargement: ' + f)
        data = pd.concat([data,
                          pd.read_csv(path_queries + f, index_col = 0)])

data.columns = [col.lower() for col in data.columns]
data.head()

Lecture des fichiers:

Fichier en cours de chargement: query_2021_1.csv
Fichier en cours de chargement: query_2021_2.csv
Fichier en cours de chargement: query_2020_1.csv
Fichier en cours de chargement: query_2020_2.csv
Fichier en cours de chargement: query_2020_3.csv
Fichier en cours de chargement: query_2020_4.csv
Fichier en cours de chargement: query_2020_5.csv
Fichier en cours de chargement: query_2020_6.csv
Fichier en cours de chargement: query_2020_7.csv
Fichier en cours de chargement: query_2020_8.csv


Unnamed: 0_level_0,title,body,tags
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
65526399,Groovy script code to append xml node taking 1...,<p>Below code is working but taking 15+ hours ...,<javascript><xml><groovy><xml-parsing><sap-clo...
65526401,How do I make a class instance using user input?,<p>I am making a text based adventure game in ...,<python>
65526407,"How to remove Branding WHMCS Ver 8.1 ""Powered ...",<p>I am a newbie and this is the first time ev...,<php><jquery><css><templates><whmcs>
65526419,How can I construct my objects allocated throu...,<p>C++20 removed the <code>construct()</code> ...,<c++><std><c++20><allocator>
65526447,React Router v5.2 - Blocking route change with...,<p>My app has two pages: <code>Step1</code> an...,<javascript><reactjs><react-router><react-rout...


In [23]:
df = data.copy()

df.tags = df.tags.map(lambda x: x[1:-1].replace('><', ' ').split())

tags_extended = [x for liste in df.tags for x in liste]
tags = set(tags_extended)

# top 20 tags
top_tags = [x[0] for x in FreqDist(tags_extended).most_common(20)]
df.tags = df.tags.map(lambda x: [t for t in x if t in top_tags])

df = df[df.tags.map(len) != 0]
df = df.head(50000)
df.head()

Unnamed: 0_level_0,title,body,tags
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
65526399,Groovy script code to append xml node taking 1...,<p>Below code is working but taking 15+ hours ...,[javascript]
65526401,How do I make a class instance using user input?,<p>I am making a text based adventure game in ...,[python]
65526407,"How to remove Branding WHMCS Ver 8.1 ""Powered ...",<p>I am a newbie and this is the first time ev...,"[php, css]"
65526419,How can I construct my objects allocated throu...,<p>C++20 removed the <code>construct()</code> ...,[c++]
65526447,React Router v5.2 - Blocking route change with...,<p>My app has two pages: <code>Step1</code> an...,"[javascript, reactjs]"


In [24]:
def clean_text(text, tags, html = False):
    if html:
      # Getting text from raw html file if it is
      from bs4 import BeautifulSoup
      soup = BeautifulSoup(text, 'html.parser')
      text = soup.get_text()
    
    # Lowering all words in text
    text = text.lower()
    
    # On sépare le texte pour ne garder que les tags et le texte sans les tags
    text_split = text.split()
    tags_in_text = [w for w in text_split if w in tags]
    text = [w for w in text_split if w not in tags]
    
    # Removing non letters (unless if word is not in tags) :
    text = ' '.join(text)
    text = re.sub('[^a-zA-Z]', ' ', text) 
    
    # On retire les mots inférieurs à 3 lettres qui n'apportent pas énormément d'information
    text = text.split()
    text = [w for w in text if len(w) > 3]
    
    text = ' '.join(text)
    text += ' '.join(tags_in_text) 
    
    return text

df.title = df.title.map(lambda x: clean_text(x, tags))
df.body = df.body.map(lambda x: clean_text(x, tags, html = True))

In [25]:
df['post'] = df.title + df.body
df.drop(columns = ['title', 'body'], inplace = True)
df = df[['post', 'tags']]

In [26]:
from nltk.stem import WordNetLemmatizer

def tokenizer(post, tags, lemmatizer = False, without_stopwords = False):
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    tags_in_post = [tag for tag in post.split()]
    post = [w for w in post.split() if w not in tags]
    post = ' '.join(post)

    token = word_tokenize(post)

    if without_stopwords:
      token = [w for w in token if w not in stopwords.words('english')]

    if lemmatizer:
      token = [lemmatizer.lemmatize(w) for w in token]

    return token + tags_in_post

df.post = df.post.map(lambda x: tokenizer(x, tags, lemmatizer = WordNetLemmatizer(), without_stopwords = True))

In [None]:
word_corpus = [token for token_list in df.post for token in token_list]
vocabulary = [x[0] for x in FreqDist(word_corpus).most_common(10000)]
df.post = df.post.map(lambda x: [t for t in x for t in vocabulary])
df = df[df.post.map(len) != 0]
df.head()