## Content
  - [Loading Data](#Loading-Data)
  - [Functions](#Functions)
    - [Count Rows Containing Substring](#Count-Rows-Containing-Substring)
    - [Find and Count Regex Pattern](#Find-and-Count-Regex-Pattern)
  - [Analysis](#Analysis)
    - [Count Rows Containing 'PROPER_NAME'](#Count-Rows-Containing-'PROPER_NAME')
    - [Count Rows Containing Words with All Capitals and Underscore](#Count-Rows-Containing-Words-with-All-Capitals-and-Underscore)
    - [Find and Count Distinct Words with All Capitals and Underscore](#Find-and-Count-Distinct-Words-with-All-Capitals-and-Underscore)


In [1]:
import pandas as pd
import re
from collections import Counter

# Placeholder-style tokens: one run of capital letters, a single underscore,
# then another run of capital letters, delimited by word boundaries.
# Because '_' counts as a word character, tokens with two or more
# underscores (e.g. A_B_C) are not matched at all.
pattern = r'\b[A-Z]+_[A-Z]+\b'

# Training split to analyse, read from the working directory.
df = pd.read_csv('train_split.csv')

def count_rows_containing(df, column, substring):
    """Count the rows of ``df[column]`` whose text contains ``substring``.

    ``substring`` is handed to ``Series.str.contains`` unchanged, so it is
    interpreted as a regular expression (the pandas default); a plain
    literal without regex metacharacters behaves like a substring search.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to search.
        substring: Literal text or regex pattern to look for.

    Returns:
        The sum of the per-row match mask, i.e. the number of matching rows.
    """
    text_values = df[column]
    match_mask = text_values.str.contains(substring)
    return match_mask.sum()

def find_and_count_pattern(df, column, pattern):
    """Tally every regex match found in a text column.

    Each string cell of ``df[column]`` is scanned with ``re.findall`` and
    all matches across all rows are counted. Cells that are not strings
    (missing values such as NaN/None, or numbers) are skipped instead of
    raising ``TypeError``, so the function accepts the same columns that
    ``Series.str.contains`` does.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to scan.
        pattern: Regular expression (string or compiled pattern).

    Returns:
        A DataFrame with columns ``'Word'`` and ``'Count'``, one row per
        distinct match, in order of first appearance. It is empty (but
        still has both columns) when nothing matches.
    """
    # Compile once up front rather than resolving the pattern on every row.
    compiled = re.compile(pattern)

    # A single generator both flattens the per-row match lists and filters
    # out non-string cells that re.findall cannot process.
    word_counts = Counter(
        match
        for cell in df[column]
        if isinstance(cell, str)
        for match in compiled.findall(cell)
    )

    # Materialise the items as a list so the DataFrame constructor gets a
    # plain sequence of (word, count) pairs, including the empty case.
    return pd.DataFrame(list(word_counts.items()), columns=['Word', 'Count'])

# All three analyses below run against the same text column.
text_column = 'full_text'

# 1) Rows mentioning the literal placeholder 'PROPER_NAME'.
count_proper_name = count_rows_containing(
    df, column=text_column, substring='PROPER_NAME'
)
print(f"Number of rows containing 'PROPER_NAME': {count_proper_name}")

# 2) Rows with at least one CAPS_CAPS token (the regex is passed as the needle).
count_capital_underscore = count_rows_containing(
    df, column=text_column, substring=pattern
)
print(f"Number of rows containing words with all capitals and underscore: {count_capital_underscore}")

# 3) Occurrence count of each distinct CAPS_CAPS token across all rows.
df_word_counts = find_and_count_pattern(df, column=text_column, pattern=pattern)
print(df_word_counts)


Number of rows containing 'PROPER_NAME': 228
Number of rows containing words with all capitals and underscore: 248
              Word  Count
0      PROPER_NAME    252
1        OTHER_PII     28
2    LOCATION_NAME     12
3      SCHOOL_NAME     14
4       CITY_STATE      1
5   STREET_ADDRESS      2
6    EMAIL_ADDRESS      2
7     STUDENT_NAME      7
8     PHONE_NUMBER      2
9       STATE_NAME      1
10       TEST_NAME      1
11    GENERIC_NAME      1
