In [None]:
import os
import pandas as pd
import regex as re
import textwrap

# Resolve the directory one level above the notebook's working directory;
# later cells address the data folder relative to it.
cwd = os.getcwd()
parent_path = os.path.split(cwd)[0]
print(parent_path)

## Basics on text as data

### Splitting strings


In [None]:
# Example sentence used to illustrate basic string handling.
my_sentence = "Hello, how are you? I'm ok thanks. How are you?"

# Splitting on whitespace keeps punctuation attached to the neighbouring word.
list_of_words = str.split(my_sentence)
print(list_of_words)

# Regex alternative (displayed as the cell output): whole words made of one
# capital letter followed by one or more lowercase letters.
re.compile(r'\b[A-Z][a-z]+\b').findall(my_sentence)

### Displaying text

In [None]:
# Load the text data from the "data" folder under parent_path. The path is
# assembled with os.path.join so the separator is supplied by the standard
# library instead of being hard-coded into a concatenated string.
df_text = pd.read_csv(os.path.join(parent_path, 'data', 'text_judiciary_weakened.csv'))
print(df_text.shape)
df_text.head()

You can see that the text is hard to read in this format. Also notice that the column "title" that we are interested in contains missing values. 
- Decide whether to remove the rows with missing values or to fill the missing titles with an empty string.
- Only one option may be active at a time: comment out the option you do not want to use!

In [None]:
# Summarise the frame: per-column non-null counts and dtypes, which shows
# which columns contain missing values.
df_text.info()

In [19]:
# Pick exactly ONE of the two options below and leave the other commented out.

# Option 1: make sure titles are strings and NaNs become empty strings
#df_text['title'] = df_text['title'].fillna('').astype(str)

# Option 2: drop only the rows whose title is missing. A bare dropna(axis=0)
# would also discard every row that has a gap in any other column.
df_text = df_text.dropna(subset=['title'])

In [None]:
# Function to wrap text of a specific cell
def wrap_text(text, width=50):
    """Wrap text to the specified width.

    Parameters
    ----------
    text : str or scalar
        Text to wrap. Missing values (None, NaN, pd.NA) are treated as an
        empty string; any other non-string scalar is converted with str().
    width : int, default 50
        Maximum length of each output line.

    Returns
    -------
    str
        The text with newlines inserted so that no line exceeds ``width``.
    """
    if not isinstance(text, str):
        # textwrap.fill only accepts strings; a missing title would otherwise
        # raise AttributeError in the middle of the display loop.
        text = "" if pd.isna(text) else str(text)
    return textwrap.fill(text, width)

# Line length (in characters) used when wrapping each title
max_width = 130

# Rule printed after every record
divider = "-" * 80

# Walk through the frame row by row and print the country code together with
# the wrapped title, followed by the rule.
for _idx, record in df_text.iterrows():
    # Wrap first, so a title that cannot be wrapped fails before anything is printed
    title_block = wrap_text(record['title'], max_width)

    print('country isocode:', record['country'])
    print(f"Title: {title_block}\n")
    print(divider)

### Features based on regex searches

In [None]:
# Capitalised words: one uppercase letter followed by one or more lowercase letters.
df_text['keywords'] = df_text['title'].str.findall(r'\b[A-Z][a-z]+\b')

# Flag titles mentioning "Judge" or "Judges". The group is non-capturing because
# a capturing group makes pandas warn that the pattern "has match groups".
# na=False marks a missing title as False instead of propagating NaN.
df_text['contains_judge'] = df_text['title'].str.contains(r'\b(?:Judge|Judges)\b', regex=True, na=False)

# Pairs of adjacent words separated by a single space. The pair is captured inside
# a lookahead so that matches may overlap: "a b c" yields ["a b", "b c"]. Without
# the lookahead each word is consumed once and every second pair is skipped.
df_text['bigrams'] = df_text['title'].str.findall(r'(?=(\b\w+\b \b\w+\b))')

# Word count, version 1: number of regex word tokens (runs of \w characters).
df_text['word_count_1'] = df_text['title'].str.count(r'\b\w+\b')

# Word count, version 2: number of whitespace-separated chunks.
df_text['word_count_2'] = df_text['title'].str.split().str.len()

df_text.head(3)
