In [1]:
import pandas as pd
import sys
import os

# For importing custom modules
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

### DATA LOADING

In [2]:
from src.data_preprocessing.loader import DataLoader
from src.data_preprocessing.cleaner import DataCleaner
from src.data_preprocessing.transformer import DataTransformer

DATA_FILEPATH = "../data/BiztelAI_DS_Dataset_V1.json"

loader = DataLoader(file_path=DATA_FILEPATH)
raw_json_data = loader.load_json_dataset()
df_raw = loader.structure_raw_dataset(raw_json_data)
print("\nInitial dataframe shape:", df_raw.shape)
df_raw.head()

Loaded JSON dataset from ../data/BiztelAI_DS_Dataset_V1.json
Raw dataset converted into DataFrame.

Initial dataframe shape: (11760, 10)


Unnamed: 0,transcript_id,article_url,config,message,agent,sentiment,knowledge_source,turn_rating,conversation_rating_agent1,conversation_rating_agent2
0,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,Did you know that the University of Iowa's loc...,agent_1,Curious to dive deeper,"(FS1,)",Good,Good,Good
1,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,I think I did hear something about that. I im...,agent_2,Neutral,"(FS1,)",Good,Good,Good
2,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,"So, it would be in the visiting team's locker ...",agent_1,Curious to dive deeper,"(FS1,)",Good,Good,Good
3,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,Right. Teams do all kinds of things to bother...,agent_2,Neutral,"(FS1,)",Good,Good,Good
4,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,"I would hate a cold bench. Then again, I would...",agent_1,Neutral,"(Personal Knowledge,)",Good,Good,Good


### DATA CLEANING

In [3]:
cleaner = DataCleaner()
df_cleaned = cleaner.clean_data(df_raw.copy())
print("\nCleaned Dataframe shape: ",df_cleaned.shape)
df_cleaned.head()

Handling missing values...
Column transcript_id has no missing values
Column article_url has no missing values
Column config has no missing values
Column message has no missing values
Column agent has no missing values
Column sentiment has no missing values
Column knowledge_source has no missing values
Column turn_rating has no missing values
Column conversation_rating_agent1 has no missing values
Column conversation_rating_agent2 has no missing values
Handling duplicate values...
Initial rows: 11760

Rows after dropping exact duplicates: 11760
Correcting data types...
Data cleaning process completed.

Cleaned Dataframe shape:  (11760, 10)


Unnamed: 0,transcript_id,article_url,config,message,agent,sentiment,knowledge_source,turn_rating,conversation_rating_agent1,conversation_rating_agent2
0,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,Did you know that the University of Iowa's loc...,agent_1,Curious to dive deeper,"(FS1,)",Good,Good,Good
1,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,I think I did hear something about that. I im...,agent_2,Neutral,"(FS1,)",Good,Good,Good
2,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,"So, it would be in the visiting team's locker ...",agent_1,Curious to dive deeper,"(FS1,)",Good,Good,Good
3,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,Right. Teams do all kinds of things to bother...,agent_2,Neutral,"(FS1,)",Good,Good,Good
4,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,"I would hate a cold bench. Then again, I would...",agent_1,Neutral,"(Personal Knowledge,)",Good,Good,Good


In [4]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11760 entries, 0 to 11759
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   transcript_id               11760 non-null  object  
 1   article_url                 11760 non-null  object  
 2   config                      11760 non-null  category
 3   message                     11760 non-null  object  
 4   agent                       11760 non-null  category
 5   sentiment                   11760 non-null  category
 6   knowledge_source            11760 non-null  object  
 7   turn_rating                 11760 non-null  category
 8   conversation_rating_agent1  11760 non-null  object  
 9   conversation_rating_agent2  11760 non-null  object  
dtypes: category(4), object(6)
memory usage: 598.2+ KB


### DATA TRANSFORMATION

In [5]:
transformer = DataTransformer()
df_processed = transformer.transform_data(df_cleaned.copy())
df_processed.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\haadi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\haadi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haadi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haadi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Applying text preprocessing to column 'message'...
Encoded column agent. Unique values: 2
Encoded column sentiment. Unique values: 8
Encoded column turn_rating. Unique values: 6
Encoded column config. Unique values: 4
Encoding multi-label column 'knowledge_source'...
Encoded 'knowledge_source' into 8 binary columns.
Data transformation complete.


Unnamed: 0,transcript_id,article_url,config,message,agent,sentiment,knowledge_source,turn_rating,conversation_rating_agent1,conversation_rating_agent2,...,turn_rating_encoded,config_encoded,ks_AS1,ks_AS2,ks_AS3,ks_AS4,ks_FS1,ks_FS2,ks_FS3,ks_Personal Knowledge
0,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,Did you know that the University of Iowa's loc...,agent_1,Curious to dive deeper,"(FS1,)",Good,Good,Good,...,2,2,0,0,0,0,1,0,0,0
1,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,I think I did hear something about that. I im...,agent_2,Neutral,"(FS1,)",Good,Good,Good,...,2,2,0,0,0,0,1,0,0,0
2,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,"So, it would be in the visiting team's locker ...",agent_1,Curious to dive deeper,"(FS1,)",Good,Good,Good,...,2,2,0,0,0,0,1,0,0,0
3,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,Right. Teams do all kinds of things to bother...,agent_2,Neutral,"(FS1,)",Good,Good,Good,...,2,2,0,0,0,0,1,0,0,0
4,t_d004c097-424d-45d4-8f91-833d85c2da31,https://www.washingtonpost.com/sports/colleges...,C,"I would hate a cold bench. Then again, I would...",agent_1,Neutral,"(Personal Knowledge,)",Good,Good,Good,...,2,2,0,0,0,0,0,0,0,1


In [6]:
print("Sample of processed messages:")
df_processed[['message', 'message_processed']].head()

Sample of processed messages:


Unnamed: 0,message,message_processed
0,Did you know that the University of Iowa's loc...,know university iowa locker room painted pink ...
1,I think I did hear something about that. I im...,think hear something imagine attempt psych team
2,"So, it would be in the visiting team's locker ...",would visiting team locker room
3,Right. Teams do all kinds of things to bother...,right team kind thing bother competition heard...
4,"I would hate a cold bench. Then again, I would...",would hate cold bench would want place cold wa...


In [7]:
print("Label encoders used (example for 'agent'):")
if 'agent' in transformer.label_encoders_map:
    print(f"Agent mapping: {dict(zip(transformer.label_encoders_map['agent'].classes_, transformer.label_encoders_map['agent'].transform(transformer.label_encoders_map['agent'].classes_)))}")

Label encoders used (example for 'agent'):
Agent mapping: {'A': np.int64(0), 'B': np.int64(1), 'C': np.int64(2), 'D': np.int64(3)}


In [8]:
print("Knowledge source classes:", transformer.multi_label_binarizers.classes_)

Knowledge source classes: ['AS1' 'AS2' 'AS3' 'AS4' 'FS1' 'FS2' 'FS3' 'Personal Knowledge']


In [9]:
df_processed.to_csv('../data/processed_chat_data.csv', index=False)
df_processed.to_pickle('../data/processed_chat_data.pkl')