In [1]:
import pandas as pd
import numpy as np

from analyze_src.basic_data_inspection import DataInspector, DataTypesInspectionStrategy, BasicInfoInspectionStrategy, RandomSamplesInspectionStrategy, TextCountStatsInspectionStrategy, CountStatsInspectionStrategy

In [2]:
data_path = '../data/Depression_Severity_Levels_Dataset.csv'
df = pd.read_csv(data_path)

In [3]:
# Step 1: Basic Data Inspection
# ------------------------------------
# Initialize the Data Inspector with a strategy for Data Types Inspection
data_inspector = DataInspector(DataTypesInspectionStrategy())
data_inspector.execute_inspection(df)


Data Types and Non-null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41873 entries, 0 to 41872
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    41859 non-null  object
 1   label   41873 non-null  object
dtypes: object(2)
memory usage: 654.4+ KB
None


In [4]:
data_inspector.set_strategy(BasicInfoInspectionStrategy())
data_inspector.execute_inspection(df)

Shape: (41873, 2)
Null values:
 text     14
label     0
dtype: int64
Duplicate rows: 6406


In [5]:
data_inspector.set_strategy(RandomSamplesInspectionStrategy())
data_inspector.execute_inspection(df)


Random Sample:
                                                     text     label
15529  I am struggling to find out what to do Either ...      mild
12248  My little brother may have cancer and I think ...    severe
17680  I am 16 for as long as I can remember I have b...      mild
7514   Why can I never win? : I feel like I can never...  moderate
15310  Does anyone every feel like laughing at memes ...      mild


In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HOME\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
data_inspector.set_strategy(TextCountStatsInspectionStrategy())
data_inspector.execute_inspection(df)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HOME\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



Character count stats:
 count    41873.000000
mean       620.619803
std        876.233440
min          1.000000
25%        121.000000
50%        360.000000
75%        768.000000
max      27390.000000
Name: char_count, dtype: float64

Word count stats:
 count    41873.000000
mean       122.008860
std        169.185182
min          1.000000
25%         23.000000
50%         71.000000
75%        152.000000
max       5248.000000
Name: word_count, dtype: float64

Sentence count stats:
 count    41873.000000
mean         7.242638
std          9.910515
min          1.000000
25%          2.000000
50%          5.000000
75%          9.000000
max        300.000000
Name: sentence_count, dtype: float64


In [8]:
print(df)

                                                    text     label  \
0      He said he had not felt that way before, sugge...      mild   
1      Hey there r/assistance, Not sure if this is th...   minimum   
2      My mom then hit me with the newspaper and it s...   minimum   
3      until i met my new boyfriend, he is amazing, h...      mild   
4      October is Domestic Violence Awareness Month a...  moderate   
...                                                  ...       ...   
41868  You cannot fix your children and you cannot pl...   minimum   
41869  Awakenings App - NEW CONTENT now available!   ...   minimum   
41870  Spend less time worrying about what others thi...   minimum   
41871  Trust is the belief that you can get through a...   minimum   
41872  It is not your duty to change the minds of oth...   minimum   

       char_count  word_count  sentence_count  
0             571         113               4  
1             588         108               3  
2             8

In [9]:
data_inspector.set_strategy(CountStatsInspectionStrategy())
data_inspector.execute_inspection(df)


Mode (character count): 37

Median (character count): 360.0

Mean (character count): 620.6198027368471

Mode (word count): 7

Median (word count): 71.0

Mean (word count): 122.00886012466268

Mode (Sentence count): 1

Median (Sentence count): 5.0

Mean (Sentence count): 7.242638454373941
