In [1]:
import lester as lt 
from transformers import pipeline 
from dateutil import parser

target_countries = ['UK', 'DE', 'FR'] 
sentiment_predictor = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def matches_usecase(text): 
    return "complaint" in text or "bank account" in text

def sanitize(text): 
    return text.lower()

def _lester_dataprep(): 
    # Load customer data 
    customer_df = lt.read_csv("data/customers.csv", header=None, names=['customer_id', 'customer_email', 'bank', 'country', 'level']) 
    customer_df = customer_df.filter('country in @target_countries') 
    customer_df = customer_df.project(target_column='is_premium', source_columns=['level'], func=lambda row: row['level'] == 'premium')

    # Select relevant columns for merging
    customer_df = customer_df[['customer_email', 'bank', 'country', 'is_premium']]
    
    # Load mails data
    mails_df = lt.read_csv("data/mails.csv", header=None, names=['mail_id', 'email', 'raw_date', 'mail_subject', 'mail_text'])
    mails_df = mails_df.project(target_column='mail_date', source_columns=['raw_date'], func=lambda row: parser.parse(row['raw_date']))
    mails_df = mails_df.filter('mail_date.dt.year >= 2022')
    mails_df = mails_df.filter('mail_text.apply(@matches_usecase)')
    
    # Merge dataframes
    merged_df = lt.join(mails_df, customer_df, left_on='email', right_on='customer_email')
    
    # Process and assign new columns
    merged_df = merged_df.project(target_column='title', source_columns=['mail_subject'], func=lambda row: sanitize(row['mail_subject']))
    merged_df = merged_df.project(target_column='text', source_columns=['mail_text'], func=lambda row: sanitize(row['mail_text']))
    merged_df = merged_df.project(target_column='sentiment', source_columns=['mail_text'], func=lambda row: sentiment_predictor(row['mail_text'])[0]['label'].lower())
    
    # Select the required columns
    result_df = merged_df[['title', 'text', 'bank', 'country', 'is_premium']]
    
    return result_df

lt.make_accessible(locals(), globals())
result_df = _lester_dataprep()

In [3]:
result_df.column_provenance

{'bank': ['0xddc14d49.bank'],
 'country': ['0xddc14d49.country'],
 'is_premium': ['0xddc14d49.level'],
 'title': ['0x46e01896.mail_subject'],
 'text': ['0x46e01896.mail_text']}

In [5]:
result_df.df

Unnamed: 0,title,text,bank,country,is_premium,__lester_provenance_0x46e01896,__lester_provenance_0xddc14d49
0,hello,bank account awesome,Deutsche Bank,FR,False,4,3
1,hello,bank account denied,Deutsche Bank,DE,True,6,2
2,hello,complaint,Deutsche Bank,FR,False,7,3
3,hello,no complaint only praise,Deutsche Bank,DE,True,8,2


import lester as lt
from transformers import pipeline
from dateutil import parser

target_countries = ['UK', 'DE', 'FR']
sentiment_predictor = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def matches_usecase(text):
    return "complaint" in text or "bank account" in text 

def sanitize(text):
    return text.lower()

def _lester_dataprep():
    # Load customer data
    customer_df = lt.read_csv("data/customers.csv", header=None, names=['customer_id', 'customer_email', 'bank', 'country', 'level'])
    customer_df = customer_df.filter('country in @target_countries')
    customer_df = customer_df.project(target_column='is_premium', source_columns=['level'], func=lambda row: row['level'] == 'premium')
    
    # Select relevant columns for merging
    customer_df = customer_df[['customer_email', 'bank', 'country', 'is_premium']]
    
    # Load mails data
    mails_df = lt.read_csv("data/mails.csv", header=None, names=['mail_id', 'email', 'raw_date', 'mail_subject', 'mail_text'])
    mails_df = mails_df.project(target_column='mail_date', source_columns=['raw_date'], func=lambda row: parser.parse(row['raw_date']))
    mails_df = mails_df.filter('mail_date.dt.year >= 2022')
    mails_df = mails_df.filter('mail_text.apply(@matches_usecase)')
    
    # Merge dataframes
    merged_df = lt.join(mails_df, customer_df, left_on='email', right_on='customer_email')
    
    # Process and assign new columns
    merged_df = merged_df.project(target_column='title', source_columns=['mail_subject'], func=lambda row: sanitize(row['mail_subject']))
    merged_df = merged_df.project(target_column='text', source_columns=['mail_text'], func=lambda row: sanitize(row['mail_text']))
    merged_df = merged_df.project(target_column='sentiment', source_columns=['mail_text'], func=lambda row: sentiment_predictor(row['mail_text'])[0]['label'].lower())
    
    # Select the required columns
    result_df = merged_df[['title', 'text', 'bank', 'country', 'is_premium']]
    
    return result_df

result_df = _lester_dataprep()





