In [2]:
import pandas as pd
import re
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from hazm import Normalizer, word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from hazm import WordTokenizer , Lemmatizer,stopwords_list

# Extracting the data

### I need a dictionary of lists in such format:
- "DID" : list of all DIDs
- "Date": list of all dates
- "Cat": list of all categories
- "Content": list of all contents <br>
since the text file doesn't have a distingushable format for python, I need to hard code a RegEx for the extraction of data

In [40]:
file_path = 'Hamshahri-Corpus.txt'

In [3]:
def parse_text_blocks(text):
    # Define the regex pattern to match each block of text
    pattern = r'\.DID\s+(.*?)\n\.Date\s+(.*?)\n\.Cat\s+(.*?)\n(.*?)\n\n\n'

    # Find all matches of the pattern in the text
    matches = re.findall(pattern, text, re.DOTALL)

    # Create a dictionary to store parsed data
    parsed_data = {'DID': [], 'Date': [], 'Cat': [], 'Content': []}

    # Iterate over matches and populate the dictionary
    for match in matches:
        did, date, cat, content = match
        parsed_data['DID'].append(did.strip())
        parsed_data['Date'].append(date.strip())
        parsed_data['Cat'].append(cat.strip())
        parsed_data['Content'].append(content.strip())

    return parsed_data

# Define a string containing all printable ASCII characters
printable_chars = set(string.printable + 'ئآابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیيكءئ')

def clean_string(s):
    # Replace any character not in the printable ASCII range with a space
    return ''.join(c if c in printable_chars else ' ' for c in s)

def export_to_excel(parsed_data, file_name):
    # Create a DataFrame from parsed data
    df = pd.DataFrame(parsed_data)

    # Remove any illegal characters from the DataFrame
    df = df.applymap(clean_string)

    try:
        # Export DataFrame to Excel
        df.to_excel(file_name, index=False)
        print(f"Data successfully exported to {file_name}")
    except Exception as e:
        print(f"An error occurred while exporting data to {file_name}: {e}")


The file it too large to extract all the data at once. so I split it into 11 parts and extract data of each part separately. It also makes debugging much easier

In [4]:
def split_file_into_chunks(file_path, num_chunks):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    total_length = len(text)
    chunk_size = total_length // num_chunks

    # Split the text into chunks
    chunks = [text[i:i+chunk_size] for i in range(0, total_length, chunk_size)]

    return chunks
# Split the main file into chunks
num_chunks = 10
file_chunks = split_file_into_chunks(file_path, num_chunks)


- parse_text_blocks will find the specified format in each chunk and add it to the dictionary. 
- export_to_excel will check if all characters are usable in Excel (to avoid IllegalCharacterError) then export each chunk into a separate excel file

In [5]:
for i, chunk in enumerate(file_chunks):
    # Parse the data from the chunk
    parsed_data = parse_text_blocks(chunk)

    # Export parsed data to Excel
    excel_file_name = f'parsed_data_part_{i}.xlsx'  # Adjust file naming if needed
    export_to_excel(parsed_data, excel_file_name)

  df = df.applymap(clean_string)


Data successfully exported to parsed_data_part_0.xlsx
Data successfully exported to parsed_data_part_1.xlsx
Data successfully exported to parsed_data_part_2.xlsx
Data successfully exported to parsed_data_part_3.xlsx
Data successfully exported to parsed_data_part_4.xlsx
Data successfully exported to parsed_data_part_5.xlsx
Data successfully exported to parsed_data_part_6.xlsx
Data successfully exported to parsed_data_part_7.xlsx
Data successfully exported to parsed_data_part_8.xlsx
Data successfully exported to parsed_data_part_9.xlsx
Data successfully exported to parsed_data_part_10.xlsx


### It's time to read all the excel files and concatenate them into one dataframe

In [122]:
dfList = [pd.read_excel(f'parsed_data_part_{i}.xlsx') for i in range(0,11)]

In [123]:
df = pd.concat(dfList, axis=0)
df

Unnamed: 0,DID,Date,Cat,Content
0,1S1,75\04\02,adabh,جاودانگي در زندگي گروهي از طريق هنر \nنگاهي به...
1,2S1,75\04\02,adabh,رويدادهاي هنري جهان \nنمايشگاه هنر در خدمت ديك...
2,3S1,75\04\02,adabh,برديوار نگارخانه ها \nگالري گلستان: \nنمايشگاه...
3,4S1,75\04\02,ejtem,بازي را جدي بگيريم \nمطالعه اي مقدماتي پيرامون...
4,5S1,75\04\02,elmfa,تخته سياه و غباري كه سترده نمي شود... \nاشاره;...
...,...,...,...,...
10686,60055S1,81\11\20,vrzsh,گره هاي كور كشتي باز مي شود\nگروه ورزشي: با ح...
10687,60055S2,81\11\20,vrzsh,نماينده فدراسيون جهاني واليبال \n از ايران هر ...
10688,60055S3,81\11\20,vrzsh,شكست نامداران تكواندودر پيكارهاي برتر ليگ \nگر...
10689,60055S4,81\11\20,vrzsh,ورزشگاه بزرگ دانشگاه آزاد در تهران \nساخته مي ...


In [124]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 165215 entries, 0 to 10690
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   DID      165215 non-null  object
 1   Date     165215 non-null  object
 2   Cat      165215 non-null  object
 3   Content  165213 non-null  object
dtypes: object(4)
memory usage: 6.3+ MB


## There are several things that we have to work on:
- check the encoding method 
- eliminating "\n"s from the text
- appropriate type casting


check the encoding

In [125]:
# Try to decode using different encodings
encodings_to_try = [ 'utf-8','latin-1', 'utf-16', 'ascii']  # Add other encodings as needed
for encoding in encodings_to_try:
    try:
        decoded_content = df['Content'].apply(lambda x: x if isinstance(x, str) else str(x))
        decoded_content.apply(lambda x: x.encode(encoding).decode(encoding))
        print(f"Decoding successful with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed to decode with encoding: {encoding}")


Decoding successful with encoding: utf-8


remove "\n"s

In [126]:
# Remove the "\n" characters from the text column
df['Content'] = df['Content'].str.replace('\n', '')

replace '\\' with '-'

In [127]:
df['Date'] = df['Date'].str.replace('\\', '-')

type casting

In [128]:
df["DID"] = df["DID"].astype("str")
df['Cat'] = df['Cat'].astype('category')
df["Content"] = df["Content"].astype("str")

## Text Preprocessing

Removing the punctuations

In [131]:
# Function to remove punctuation from text
def remove_punctuation(text):
    # Define a translation table with all punctuation characters mapped to None
    translator = str.maketrans('', '', string.punctuation)
    # Remove punctuation using the translation table
    return text.translate(translator)

# Apply the function to the "Content" column
df['Content'] = df['Content'].apply(remove_punctuation)
df

Unnamed: 0,DID,Date,Cat,Content
0,1S1,75-04-02,adabh,جاودانگي در زندگي گروهي از طريق هنر نگاهي به ن...
1,2S1,75-04-02,adabh,رويدادهاي هنري جهان نمايشگاه هنر در خدمت ديكتا...
2,3S1,75-04-02,adabh,برديوار نگارخانه ها گالري گلستان نمايشگاه طرح ...
3,4S1,75-04-02,ejtem,بازي را جدي بگيريم مطالعه اي مقدماتي پيرامون ن...
4,5S1,75-04-02,elmfa,تخته سياه و غباري كه سترده نمي شود اشاره به رغ...
...,...,...,...,...
10686,60055S1,81-11-20,vrzsh,گره هاي كور كشتي باز مي شودگروه ورزشي با حضور...
10687,60055S2,81-11-20,vrzsh,نماينده فدراسيون جهاني واليبال از ايران هر نظ...
10688,60055S3,81-11-20,vrzsh,شكست نامداران تكواندودر پيكارهاي برتر ليگ گروه...
10689,60055S4,81-11-20,vrzsh,ورزشگاه بزرگ دانشگاه آزاد در تهران ساخته مي شو...


Removing the numbers

In [132]:
# Function to remove numbers from text using regular expressions
def remove_numbers(text):
    # Use regular expression to remove all numbers
    return re.sub(r'\d+', '', text)

# Apply the function to the "Content" column
df['Content'] = df['Content'].apply(remove_numbers)

# Display the DataFrame
df

Unnamed: 0,DID,Date,Cat,Content
0,1S1,75-04-02,adabh,جاودانگي در زندگي گروهي از طريق هنر نگاهي به ن...
1,2S1,75-04-02,adabh,رويدادهاي هنري جهان نمايشگاه هنر در خدمت ديكتا...
2,3S1,75-04-02,adabh,برديوار نگارخانه ها گالري گلستان نمايشگاه طرح ...
3,4S1,75-04-02,ejtem,بازي را جدي بگيريم مطالعه اي مقدماتي پيرامون ن...
4,5S1,75-04-02,elmfa,تخته سياه و غباري كه سترده نمي شود اشاره به رغ...
...,...,...,...,...
10686,60055S1,81-11-20,vrzsh,گره هاي كور كشتي باز مي شودگروه ورزشي با حضور...
10687,60055S2,81-11-20,vrzsh,نماينده فدراسيون جهاني واليبال از ايران هر نظ...
10688,60055S3,81-11-20,vrzsh,شكست نامداران تكواندودر پيكارهاي برتر ليگ گروه...
10689,60055S4,81-11-20,vrzsh,ورزشگاه بزرگ دانشگاه آزاد در تهران ساخته مي شو...


Tokenizing the text

In [133]:
tokenizer = WordTokenizer()
df['Tokenized_Content'] = df['Content'].apply(lambda text: tokenizer.tokenize(text))

Removing stop words 

In [134]:
# Read stop words from the text file
with open("PersianStopWords.txt", "r", encoding="utf-8") as file:
    stop_words = set(file.read().splitlines())

# Define a function to remove stop words
def remove_stop_words(tokens):
    return [word for word in tokens if word not in stop_words]

# Apply the function to the "Tokenized_Content" column
df['Tokenized_Content'] = df['Tokenized_Content'].apply(remove_stop_words)

## Five most frequently used tokens

In [137]:
# Flatten the list of tokens
all_tokens = [word for tokens in df['Tokenized_Content'] for word in tokens]

# Count the frequency of each word
word_counts = Counter(all_tokens)

# Get the five most common words
most_common_words = word_counts.most_common(5)

In [138]:
most_common_words

[('كشور', 229758),
 ('سال', 205930),
 ('ايران', 196647),
 ('تهران', 131980),
 ('اسلامي', 116233)]

Join all tokens into one string

In [139]:
df['Tokenized_Content'] = df['Tokenized_Content'].apply(lambda tokens: ' '.join(tokens))

Normalizing the preprocessed text

In [140]:
normalizer = Normalizer()
df['Tokenized_Content'] = df['Tokenized_Content'].apply(lambda text: normalizer.normalize(text))

Now We can safely replace the "Content" column

In [141]:
df['Content'] = df['Tokenized_Content']
# df = df.drop(['Tokenized_Content'], axis=1)

In [142]:
df.dropna(inplace=True)

## TF-IDF

In [143]:
# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the 'Content' column of the DataFrame
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])

# Convert to DataFrame (optional)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [144]:
tfidf_df

Unnamed: 0,آب,آباد,آتش,آثار,آخرین,آذربایجان,آزاد,آزادی,آسیا,آغاز,...,گل,گوید,گویند,گیر,گیرد,گیرند,گیری,یاد,یافت,یافته
0,0.0,0.0,0.0,0.258212,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.053661,0.000000,0.067469,0.000000,0.000000,0.0,0.0,0.000000,0.000000
1,0.0,0.0,0.0,0.413254,0.000000,0.0,0.000000,0.0,0.000000,0.040819,...,0.0,0.000000,0.000000,0.033744,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2,0.0,0.0,0.0,0.141249,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.000000,0.000000,0.0,0.043216,0.0,0.000000,0.032650,...,0.0,0.000000,0.016168,0.035987,0.022619,0.000000,0.0,0.0,0.013001,0.000000
4,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.026711,...,0.0,0.000000,0.000000,0.022081,0.055515,0.038506,0.0,0.0,0.000000,0.030944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165210,0.0,0.0,0.0,0.000000,0.031216,0.0,0.000000,0.0,0.000000,0.024451,...,0.0,0.032152,0.000000,0.020213,0.025409,0.000000,0.0,0.0,0.000000,0.000000
165211,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.068063,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
165212,0.0,0.0,0.0,0.000000,0.142440,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.133282,0.000000
165213,0.0,0.0,0.0,0.000000,0.000000,0.0,0.428700,0.0,0.000000,0.064777,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000


In [145]:
top_terms_per_document = {}
for i, row in tfidf_df.iterrows():
    top_terms = row.sort_values(ascending=False).head(5).index.tolist()  
    top_terms_per_document[i] = top_terms

# Print the top terms for each document
for document, top_terms in top_terms_per_document.items():
    print(f"Document {document}: {', '.join(top_terms)}")

Document 0: نقاشی, زندگی, آثار, موضوع, گروهی
Document 1: نمایشگاه, آثار, نمایش, خانه, هنری
Document 2: نقاشی, نمایشگاه, ساعت, تلفن, شماره
Document 3: بازی, بچه, خانه, کودکان, زندگی
Document 4: آموزشی, مدارس, آموزش, آموزان, دانش
Document 5: آذربایجان, جمهوری, مجلس, ملی, خواند
Document 6: دانش, آموزان, معرفی, نهایی, صرف
Document 7: دانشگاه, دانشجویان, علمی, اعتراض, صدور
Document 8: سبز, نیست, نیستند, هست, سوال
Document 9: دانش, آموزان, مختلف, نقاط, آموزش
Document 10: رشد, اقتصادی, سال, درصد, آینده
Document 11: مدیران, دوره, مدیریت, توسعه, صنعتی
Document 12: صنایع, رفع, وزیر, سرمایه, تسهیلات
Document 13: آب, مصرف, هفته, تعیین, آغاز
Document 14: تجاری, ژاپن, ماه, کاهش, درصد
Document 15: تلویزیون, تولید, دستگاه, هزار, اختصاص
Document 16: تامین, تن, کشور, وارد, درصد
Document 17: شرکت, سهام, افزایش, ریال, هفته
Document 18: جهان, افزایش, بانک, دلار, میلیارد
Document 19: اروپا, اجلاس, انگلیس, اتحادیه, اروپایی
Document 20: سرمایه, طرح, خارجی, صنعتی, گذاری
Document 21: دانش, آموزان, گوید, پر, میل

## Top five important terms used in the whole dataset

In [146]:
# Sum the TF-IDF values for each term across all documents
total_tfidf = tfidf_df.sum()

# Sort the terms based on their total TF-IDF values
top_terms = total_tfidf.sort_values(ascending=False).head(5)

# Print the top terms
print("Top five terms used in all documents:")
for term, tfidf in top_terms.items():
    print(f"{term}: Total TF-IDF = {tfidf}")

Top five terms used in all documents:
ایران: Total TF-IDF = 6274.548198367028
کشور: Total TF-IDF = 5875.753677587329
تهران: Total TF-IDF = 5288.305372489105
سال: Total TF-IDF = 5077.228962087621
تیم: Total TF-IDF = 4614.837217279096
