In [15]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw politeness corpus; later cells read its 'txt' and 'sent_id' columns.
df = pd.read_csv(filepath_or_buffer='data_files/politeness.csv')

In [7]:
# Word count per row: number of pieces obtained by splitting 'txt' on single spaces.
df['len'] = df['txt'].map(lambda sentence: len(sentence.split(' ')))

In [18]:
# Print the 25th/50th/75th/90th percentiles of the word counts, one per line.
print(*np.percentile(df['len'], [25, 50, 75, 90]), sep='\n')

8.0
14.0
23.0
33.0


In [23]:
def sample_data(df, N, strategy_dict, sampling_key, random_state=None):
    """Sample rows from ``df``, optionally stratified by value buckets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    N : int
        Total number of samples requested.
    strategy_dict : dict or None
        Maps a bucket ``key`` to a sampling ratio. ``key[0]`` is the inclusive
        lower bound and ``key[1]`` the exclusive upper bound applied to
        ``df[sampling_key]``; each bucket contributes ``N * ratio`` rows
        (truncated to an integer). If empty or ``None``, a plain sample of
        ``N`` rows is drawn from the whole frame instead.
    sampling_key : str
        Column whose values decide bucket membership.
    random_state : optional
        Passed unchanged to ``DataFrame.sample`` (every bucket uses the same
        value).

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels. A bucket (or
        the whole frame) holding fewer rows than requested contributes all of
        its rows, so the result may contain fewer than ``N`` rows.
    """
    if not strategy_dict:
        return df.sample(min(N, len(df)), random_state=random_state)

    # Collect the per-bucket samples and concatenate once, instead of growing
    # a frame inside the loop starting from an empty DataFrame.
    bucket_samples = []
    for key, ratio in strategy_dict.items():
        lower, upper = key[0], key[1]
        bucket = df[(df[sampling_key] >= lower) & (df[sampling_key] < upper)]
        # Round away float noise before truncating: 100 * 0.29 evaluates to
        # 28.999999999999996, which a bare int() would turn into 28.
        target = int(round(N * ratio, 6))
        bucket_samples.append(
            bucket.sample(min(target, len(bucket)), random_state=random_state)
        )
    return pd.concat(bucket_samples)

def filter_text(text, min_len=10, max_len=33, skip_first_tokens=('re', 'forwarded')):
    """Decide whether a sentence should be kept for sampling.

    The text is split on single spaces. It is rejected when the number of
    pieces falls outside ``[min_len, max_len]`` or when the first piece is
    one of ``skip_first_tokens``.

    Parameters
    ----------
    text : str
        Sentence to check.
    min_len, max_len : int
        Inclusive bounds on the number of space-separated pieces. The
        defaults (10 and 33) are the thresholds that were previously
        hard-coded here.
    skip_first_tokens : collection of str
        Leading tokens that disqualify the text; the defaults are the two
        tokens previously hard-coded ('re' and 'forwarded').

    Returns
    -------
    bool
        ``True`` if the text passes both checks, ``False`` otherwise.
    """
    tokens = text.split(' ')
    if not min_len <= len(tokens) <= max_len:
        return False
    # str.split(' ') always yields at least one element, so tokens[0] is safe.
    return tokens[0] not in skip_first_tokens

In [24]:
# Flag each row with the result of filter_text on its 'txt' value.
df['selected'] = df['txt'].map(filter_text)

In [25]:
# How many rows pass / fail the text filter.
df.value_counts(subset='selected')

selected
True     1477130
False    1049629
dtype: int64

In [26]:
# Draw 1000 rows with a fixed seed so the exported sample can be reproduced
# after a kernel restart.
# NOTE(review): this samples from the whole frame; the 'selected' flag computed
# by filter_text is not applied here — confirm whether that is intended.
s_df = df.sample(1000, random_state=42)

In [28]:
# Add export-facing columns: 'text' duplicates 'txt', 'id' is 'sent_id' cast to str.
s_df = s_df.assign(text=s_df['txt'], id=s_df['sent_id'].astype(str))

In [10]:
# Preview the sampled sentences.
s_df.loc[:, 'txt']

159201     [ image]= bush with default orpheum theatre tu...
711867                         < jwallbillich@glgt.com > , <
2271305    to the extent all of our deal costs ( includin...
931833     there is also a 10 % discount for one addition...
2221369             i asked him if you had told him anything
                                 ...                        
1421919        thanks , kate kimberly hundl@enron 11/27/2000
468833     separately , the firm also moved today to bloc...
1046411           the co. # is 0011 and the rc # is 100038 .
725448     subject : llgm meetings meeting confirmation :...
1697052    need to identify the overall goal and time - f...
Name: txt, Length: 1000, dtype: object

In [20]:
# Write the sampled rows to a file in JSONL format (one JSON record per line).
# The context manager closes the handle, so buffered data is flushed to disk
# instead of relying on garbage collection of a never-closed file object.
# NOTE(review): the file name says "100" while the CSV export is named
# "politeness_1000" — confirm which name is intended.
with open('data_files/politeness_100.jsonl', 'w') as f:
    print(s_df.to_json(orient='records', lines=True), file=f)

In [29]:
# Export the sample as CSV without the original 'txt' / 'sent_id' columns
# and without the index.
(
    s_df
    .drop(['txt', 'sent_id'], axis='columns')
    .to_csv('data_files/politeness_1000.csv', index=False)
)