In [None]:
import wikipedia
import codecs
import string

import pandas as pd
import numpy as np
import cufflinks as cf
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from urllib.parse import unquote
from wordcloud_fa import WordCloudFa
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

## Collect Data from Wikipedia Manually

After finding a question and its answer in a Wikipedia article, assign each value to the corresponding variable in the cell below.
Be careful not to run the append cell more than once for the same set of values, because every run adds another (duplicate) row to the dataset.

In [None]:
# Start from an empty frame with one column per field collected for each sample.
df = pd.DataFrame(columns=['question', 'answer', 'link', 'paragraph_text'])

In [None]:
# One manually collected sample: replace these four values for every new
# question/answer pair before running the append cell.
# `link` is the percent-encoded URL of the source article on fa.wikipedia.org.
question = 'درمان نشانگان هلپ چگونه انجام می‌شود؟'
answer = '''فعلا، تنها و موثرترین درمان توصیه شده، زایمان است، زیرا باعث بهبود علایم می شود و با خروج جفت به تدریج علائم کاملا از بین می روند. زایمان سریع تنها گزینه قابل قبول در مواردی است که منجر به نارسایی چند ارگان شده، خونریزی رخ داده و یا خطر قابل توجه برای جنین وجود دارد.

بعضی از داروها نیز برای درمان علائم خاص استفاده می شوند.'''
link = 'https://fa.wikipedia.org/wiki/%D9%86%D8%B4%D8%A7%D9%86%DA%AF%D8%A7%D9%86_%D9%87%D9%84%D9%BE'
paragraph_text = '''فعلا، تنها و موثرترین درمان توصیه شده، زایمان است، زیرا باعث بهبود علایم می شود و با خروج جفت به تدریج علائم کاملا از بین می روند. زایمان سریع تنها گزینه قابل قبول در مواردی است که منجر به نارسایی چند ارگان شده، خونریزی رخ داده و یا خطر قابل توجه برای جنین وجود دارد.

بعضی از داروها نیز برای درمان علائم خاص استفاده می شوند.'''

In [None]:
# Add the current sample as a new row at the end of the dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is wrapped in a one-row frame and concatenated instead; ignore_index=True
# renumbers the result 0..n-1 exactly as the old append call did.
new_row = pd.DataFrame([{
    'question': question,
    'answer': answer,
    'link': link,
    'paragraph_text': paragraph_text}])
df = pd.concat([df, new_row], ignore_index=True)

In [None]:
# Persist the collected rows to disk (the frame's index is written along with the columns).
df.to_csv('../data/processed/1000.csv')  # saving data to a dataset

## Importing Crawled Dataset

In [None]:
# Load the saved dataset: the first CSV column is read as the index and then
# dropped in favour of a fresh 0..n-1 index.
df = pd.read_csv('../data/processed/1000.csv', index_col=0).reset_index(drop=True)

In [None]:
# Remove rows whose question text repeats an earlier row, then renumber the index
# so that labels and positions coincide again.
df = df.drop_duplicates(subset='question').reset_index(drop=True)

## Crawl Wikipedia's Pages Content, Categories and Title

To crawl the categories, title and full content of the Wikipedia article behind each row of the dataset, run the cell below. It visits one article per row, so it takes a while to finish; run it only when you actually need to (re)build these columns.

In [None]:
# Fetch the article behind every row's link from Persian Wikipedia and store
# its full text, title and category list next to the question/answer.
wikipedia.set_lang('fa')
wikipedia.set_rate_limiting(rate_limit=True)

# Each `categories` cell will hold a Python list, so the column has to be of
# object dtype before single cells are assigned through `.at`.
df['categories'] = np.nan
df['categories'] = df['categories'].astype('object')

for i, row in df.iterrows():
    # `i` is the index *label* yielded by iterrows; read the link from the row
    # itself instead of feeding that label to positional `.iloc`.
    topic = row.link.replace('https://fa.wikipedia.org/wiki/', '')
    topic = unquote(topic)  # decode the percent-encoded article title
    page = wikipedia.WikipediaPage(topic)
    df.loc[i, 'text'] = page.content
    df.loc[i, 'title'] = page.title
    df.at[i, 'categories'] = page.categories
    print('yay ' + str(i))

In [None]:
# Overwrite the dataset file so it also contains the crawled text, title and categories columns.
df.to_csv('../data/processed/1000.csv')  # saving the expanded dataset

## Removing Numbers, Punctuation and Stopwords

In [None]:
# Load the stop-word list (each stripped line of the file is treated as one
# stop word) and extend it with a few Wikipedia boilerplate tokens.
# The `with` block closes the file handle, and the explicit UTF-8 encoding keeps
# the Persian text from being decoded with a platform-dependent default codec.
with open('../data/raw/stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]
stopwords += ['ویکی', 'پدیا', 'جستارهای', 'وابسته', 'پدیای', 'دانشنامه']

In [None]:
def replace_punction_marks_and_numbers(word):
    """Blank out punctuation, digits and every printable ASCII character.

    Each unwanted character is replaced by a single space, so the result has
    the same length as the input.

    Args:
        word: the string to clean.

    Returns:
        A copy of ``word`` in which Persian punctuation marks, Persian digits
        and all characters of ``string.printable`` (ASCII letters, digits,
        punctuation and whitespace) have been turned into spaces.
    """
    # string.printable already covers the ASCII punctuation and the newline;
    # only the Persian marks and digits need to be added on top of it.
    unwanted = set(string.printable)
    unwanted.update(['«', '»', '،', '؛', '؟'])
    unwanted.update(['۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹'])

    to_space = {ord(char): ' ' for char in unwanted}
    return word.translate(to_space)

In [None]:
def remove_odds(sentence):
    """Return ``sentence`` reduced to plain words separated by single spaces.

    Every whitespace-separated token has its zero-width non-joiner (U+200C)
    and left-to-right mark (U+200E) characters turned into spaces and is then
    passed through ``replace_punction_marks_and_numbers``. The cleaned tokens
    are joined back together and any run of whitespace is collapsed.

    Args:
        sentence: the raw text to clean.

    Returns:
        The cleaned text as a single string.
    """
    cleaned_tokens = [
        replace_punction_marks_and_numbers(
            token.replace('\u200c', ' ').replace('\u200e', ' ')
        )
        for token in sentence.split()
    ]
    # Cleaning introduces extra spaces; split/join squeezes them to one each.
    rejoined = ' '.join(cleaned_tokens)
    return ' '.join(rejoined.split())

In [None]:
# Clean the three free-text columns in place with the same routine.
for column in ('answer', 'question', 'text'):
    df[column] = df[column].apply(remove_odds)

## Two Simple Plots for Answers and Questions

In [None]:
# Configure cufflinks before drawing the interactive plots.
# NOTE(review): go_offline() is immediately followed by
# set_config_file(offline=False, ...); confirm that both calls are intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Histogram of answer lengths, measured in characters of the `answer` string.
df['answer_len'] = df['answer'].apply(len)
df['answer_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='answer length',
    linecolor='black',
    color='red',
    yTitle='count',
    title='Answer Length Distribution')

In [None]:
# Histogram of question lengths, measured in characters of the `question` string.
df['question_len'] = df['question'].apply(len)
df['question_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='question length',  # axis label previously misspelled as 'quesiton'
    linecolor='black',
    color='blue',
    yTitle='count',
    title='Question Length Distribution')

### Drawing Word Cloud for articles' content

In [None]:
def get_n_common_words(df, column, n=20):
    """Return the ``n`` most frequent whitespace-separated words of a column.

    Args:
        df: DataFrame that holds the text column.
        column: name of the column whose string values are tokenised.
        n: how many of the top words to return (default 20).

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least frequent.
    """
    tally = Counter()
    # Feed the counter one cell at a time instead of building one big word list.
    for cell in df[column].values:
        tally.update(cell.split())
    return tally.most_common(n)

In [None]:
# Top-20 (the function's default n) most frequent words of the answers and of
# the questions, each as a two-column frame: the word and how often it occurs.
ans_temp_df = pd.DataFrame(get_n_common_words(df, 'answer'), columns = ['word', 'count'])
q_temp_df = pd.DataFrame(get_n_common_words(df, 'question'), columns = ['word', 'count'])

In [None]:
# Concatenate every article's text and drop the stop words; `text2` is the
# filtered corpus handed to the word cloud.
text = ' '.join(df.text.values)

# Membership tests against a list rescan the whole list for every token; a set
# gives the same answers with constant-time lookups.
stopword_set = set(stopwords)
text2 = ' '.join(t for t in text.split() if t not in stopword_set)

In [None]:
# Build a 1000x500 word cloud on a white background from the stop-word-free
# corpus, show it, and save a copy as a PNG file.
wordcloud = WordCloudFa(width=1000, height=500, background_color='white')
wc = wordcloud.generate(text2)
image = wc.to_image()
image.show()
image.save('../data/visualized/wordcloud.png')

## Checking The Similarity Between Questions And Answers

Here we use TF-IDF vectors and cosine similarity to measure how similar each question is to its answer.
What about the similarity among the answers themselves, or among the questions?
Let's check both below.

In [None]:
# Stack all answers first and all questions after them: entry k of `doc` is
# answer k, and entry len(answers) + k is the question of the same row.
answers = df.answer.to_list()
questions = df.question.to_list()
doc = answers + questions

In [None]:
# Vectorise every answer and question with TF-IDF, then multiply the matrix by
# its transpose to obtain a dense square matrix of pairwise dot products.
# NOTE(review): reading these dot products as cosine similarities presumes the
# vectoriser L2-normalises each row (believed to be its default) - confirm if
# TfidfVectorizer options are ever changed.
vect = TfidfVectorizer()
tfidf = vect.fit_transform(doc)
a = (tfidf * tfidf.T).toarray()

In [None]:
# Count, for every similarity threshold, how many question/answer pairs reach it.
# `doc` is answers followed by questions, so in the similarity matrix row k is
# answer k and column k + n_pairs is its own question.
shape = a.shape
# Derive the number of pairs from the matrix instead of hard-coding 1000, so the
# cell keeps working when de-duplication changes the number of rows.
n_pairs = shape[0] // 2

indexes = []
for threshold in np.arange(0, 1, 0.1):
    level = round(threshold, 1)  # tidy label (0.3 rather than 0.30000000000000004)
    for i in range(n_pairs):
        if a[i][i + n_pairs] >= threshold:
            indexes.append((level, i, i + n_pairs))

plot_df = pd.DataFrame(indexes, columns=['similarity', 'a_index', 'q_index'])

plt.figure(figsize=(20,10))
color = sns.color_palette("husl", 9)[::-1]
sns.set_style("ticks",{'axes.grid' : True})

# A pair that reaches a threshold is also counted at every lower one, so the
# bars show cumulative ("at least this similar") counts.
ax = sns.countplot(x="similarity", data=plot_df, palette=color,)
plt.savefig('../data/visualized/similarity_count.png')

In [None]:
# Count, for every similarity threshold, how many ordered pairs of *different*
# answers reach it. `doc` is answers followed by questions, so the answers
# occupy the first n_pairs rows and columns of the similarity matrix.
shape = a.shape
# Derive the number of answers from the matrix instead of hard-coding 1000.
n_pairs = shape[0] // 2

aa_block = a[:n_pairs, :n_pairs]
off_diagonal = ~np.eye(n_pairs, dtype=bool)  # excludes each answer paired with itself

indexes_a = []
for threshold in np.arange(0, 1, 0.1):
    level = round(threshold, 1)
    # argwhere lists the matching (i, j) positions row by row - the same order
    # the nested Python loops produced, without iterating over every cell.
    hits = np.argwhere((aa_block >= threshold) & off_diagonal)
    indexes_a.extend((level, i, j) for i, j in hits.tolist())

plot_aa_df = pd.DataFrame(indexes_a, columns=['similarity', 'a_index', 'a_index2'])

plt.figure(figsize=(20,10))
color = sns.color_palette("husl", 9)[::-1]
sns.set_style("ticks",{'axes.grid' : True})

ax = sns.countplot(x="similarity", data=plot_aa_df, palette=color,)
plt.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation

plt.savefig('../data/visualized/similarity_aa_count.png')

In [None]:
# Count, for every similarity threshold, how many ordered pairs of *different*
# questions reach it. `doc` is answers followed by questions, so the questions
# occupy rows and columns n_pairs .. 2 * n_pairs - 1 of the similarity matrix.
shape = a.shape
# Derive the number of questions from the matrix instead of hard-coding 1000..2000.
n_pairs = shape[0] // 2

qq_block = a[n_pairs:2 * n_pairs, n_pairs:2 * n_pairs]
off_diagonal = ~np.eye(n_pairs, dtype=bool)  # excludes each question paired with itself

indexes_q = []
for threshold in np.arange(0, 1, 0.1):
    level = round(threshold, 1)
    # Positions inside the block are shifted by n_pairs to get back the row and
    # column numbers of the full matrix, as the original loops reported them.
    hits = np.argwhere((qq_block >= threshold) & off_diagonal) + n_pairs
    indexes_q.extend((level, i, j) for i, j in hits.tolist())

plot_qq_df = pd.DataFrame(indexes_q, columns=['similarity', 'q_index', 'q_index2'])

plt.figure(figsize=(20,10))
color = sns.color_palette("husl", 9)[::-1]
sns.set_style("ticks",{'axes.grid' : True})

ax = sns.countplot(x="similarity", data=plot_qq_df, palette=color,)
plt.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation

plt.savefig('../data/visualized/similarity_qq_count.png')