**Installation**

In [None]:
# Install the third-party packages this notebook needs.
# %pip (instead of !pip) installs into the environment of the running kernel.
# hazm was previously installed twice; one install is enough.
%pip install -q keras
%pip install -q nltk
%pip install -q hazm
%pip install -q "clean-text[gpl]"

**Import required packages**

In [2]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from hazm import *
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re
from string import punctuation
from cleantext import clean
import hazm

# Preparing Dataset

**Convert txt to csv**

In [48]:
# Flatten the raw corpus into one record per line of Corpus.txt:
#     <category>,,,,,,,,,,<document text>,,,,,,,,,,
# A line containing ".Cat\t" starts a new record and carries its category.
# Following lines that begin with an Arabic-script character or whitespace
# are appended to the current record's text; every other line is skipped.
#
# Both files are opened in a `with` block so the output is flushed and closed
# before the next cell reads it, and the encoding is explicit so the Persian
# text does not depend on the platform default.
text_line = re.compile(r'^[\u0600-\u06FF\s]')

with open('Hamshahri-Corpus.txt', 'r', encoding='utf-8') as file, \
        open('Corpus.txt', 'w', encoding='utf-8') as file2:
    counter = 0
    for line in file.read().splitlines():
        if ".Cat\t" in line:
            if counter == 0:
                # First record: nothing to terminate yet.
                counter += 1
            else:
                # Close the previous record before starting the next one.
                file2.write(',,,,,,,,,,')
                file2.write('\n')
            file2.write(line.replace(".Cat\t", ""))
            file2.write(',,,,,,,,,,')

        elif text_line.search(line):
            # Body lines are written back-to-back with nothing between them.
            # NOTE(review): words at adjacent line boundaries are fused unless
            # the source lines carry their own leading/trailing whitespace;
            # confirm this is intended.
            file2.write(line)

In [None]:
# Parse the intermediate Corpus.txt produced by the conversion cell. Fields are
# split on the multi-comma marker that the conversion cell writes after each
# category and after each document body. index_col=-1 uses the last parsed field
# as the index, which leaves the two data columns named below.
# NOTE(review): presumably that last field is the empty one left by the trailing
# marker; confirm by inspecting the parsed frame.
df = pd.read_csv("Corpus.txt", sep=',,,,,,,,,,', lineterminator='\n', index_col=-1, header=None)
df.columns = ['Cat', 'Text']

# Persist the two-column frame as a regular comma-separated file (index not written).
df.to_csv('Corpus.csv', index=None)
# Reload it; `Corpus` is the frame the following cells clean and tokenize.
Corpus = pd.read_csv('Corpus.csv', sep=',', encoding='utf-8')

**Cleaning**

In [31]:
# Load the stop-word list: one entry per line of PersianStopWords.txt.
# The encoding is explicit so the Persian text is decoded the same way on every
# platform. splitlines() already removes the line terminators, so no extra
# newline stripping is needed.
with open('PersianStopWords.txt', encoding='utf-8') as f:
    stop_words = f.read().splitlines()

In [32]:
from cleantext import clean
import hazm
# Everything outside this whitelist is deleted by cleaning(): the listed
# Arabic-script letters, the ordinary space and the zero-width non-joiner (U+200C).
_DISALLOWED_CHARS = re.compile(
    '[^ \u0622\u0627\u0628\u067E\u062A-\u062C\u0686\u062D-\u0632\u0698\u0633'
    '-\u063A\u0641\u0642\u06A9\u06AF\u0644-\u0648\u06CC\u200c]')
_WHITESPACE_RUN = re.compile(r"\s+")

# Created on first use and then shared, so the normalizer is not rebuilt for every row.
_NORMALIZER = None


def _shared_normalizer():
    """Return the shared hazm.Normalizer, constructing it on first use."""
    global _NORMALIZER
    if _NORMALIZER is None:
        _NORMALIZER = hazm.Normalizer()
    return _NORMALIZER


def cleaning(text):
    """Clean one document and return the cleaned string.

    Steps, in order: strip surrounding whitespace; run cleantext's ``clean``
    (URLs, e-mail addresses, phone numbers and currency symbols are replaced
    by a space, line breaks removed); normalize with hazm; drop tokens found
    in the module-level ``stop_words``; delete every character outside the
    whitelist; collapse runs of whitespace into a single space.

    Parameters
    ----------
    text : str or any
        The raw document. Non-string values (e.g. NaN for a missing text)
        are returned unchanged so that the later ``dropna(subset=['Text'])``
        can still detect them, instead of being pushed through the pipeline.

    Returns
    -------
    str or any
        The cleaned text, or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=False,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url=" ",
        replace_with_email=" ",
        replace_with_phone_number=" ",
        replace_with_number=" ",
        replace_with_digit=" ",
        replace_with_currency_symbol=" ",
    )
    text = _shared_normalizer().normalize(text)

    # Remove stop words from the text; a set makes each lookup constant-time.
    blocked = set(stop_words)
    text = ' '.join(word for word in text.split() if word not in blocked)

    # Keep only whitelisted characters. After this step the text is plain,
    # encodable Unicode, so no encode/decode round-trip is needed.
    text = _DISALLOWED_CHARS.sub("", text)

    # removing extra spaces
    text = _WHITESPACE_RUN.sub(" ", text)

    return text

In [33]:
# Run every document through cleaning() and store the result back in the Text column.
cleaned_text = Corpus['Text'].apply(cleaning)
Corpus['Text'] = cleaned_text

In [34]:
# Lower-case the category labels so labels differing only in case are merged.
# The vectorised .str accessor leaves missing categories as NaN instead of
# raising AttributeError on them; those rows are dropped in the
# "Missing values" section below.
Corpus['Cat'] = Corpus['Cat'].str.lower()

**Missing values**

In [None]:
# Show the frame summary (columns, dtypes, non-null counts).
print('data information')
# info() prints its report itself and returns None, so it must not be wrapped
# in print(); doing so emitted a stray "None" line.
Corpus.info()
print()

# print missing values information
print('missing values stats')
print(Corpus.isnull().sum(), '\n')

In [None]:
# Drop rows with a missing category or text, keep the first occurrence of each
# distinct text, and renumber the index from zero.
Corpus = (
    Corpus
    .dropna(subset=['Cat', 'Text'])
    .drop_duplicates(subset=['Text'], keep='first')
    .reset_index(drop=True)
)

# print data information
print('data information')
# info() prints its report itself and returns None, so it must not be wrapped
# in print(); doing so emitted a stray "None" line.
Corpus.info()
print()

# print missing values information
print('missing values stats')
print(Corpus.isnull().sum(), '\n')

**Tokenizing and Splitting the Dataset**

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each post.
MAX_SEQUENCE_LENGTH = 250
# The embedding dimension
EMBEDDING_DIM = 100

# Characters the tokenizer strips before splitting. The backslash is written as
# an explicit "\\" so the literal has the same value as before without relying
# on the invalid escape sequence "\]".
TOKENIZER_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

# Build the vocabulary from the cleaned documents.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=TOKENIZER_FILTERS, lower=True)
tokenizer.fit_on_texts(Corpus['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each document as a sequence of word indices, then bring every
# sequence to a fixed length of MAX_SEQUENCE_LENGTH.
sequences = tokenizer.texts_to_sequences(Corpus['Text'].values)
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
# One-hot encode the category labels: one indicator column per distinct value of Corpus['Cat'].
label_matrix = pd.get_dummies(Corpus['Cat'])
Y = label_matrix.values
print('Shape of label tensor:', Y.shape)

# Hold out a quarter of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

# Report the feature/label shapes of the training split, then of the test split.
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)