In [1]:
import os
from dateutil import parser
from transformers import pipeline
os.environ["TOKENIZERS_PARALLELISM"] = "False"


# Only customers whose country is in this list are kept.
target_countries = ['UK', 'DE', 'FR']
# Maps customer e-mail -> (bank, country, is_premium); filled while reading customers.csv.
customer_data = {}

# transformers sentiment-analysis pipeline; the 'label' of its first prediction is used per mail.
sentiment_predictor = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def matches_usecase(text):
    """Return True if *text* mentions a complaint or a bank account.

    The check is a plain, case-sensitive substring search.
    """
    keywords = ("complaint", "bank account")
    return any(keyword in text for keyword in keywords)

def sanitize(text):
    """Normalise *text* by converting it to lower case."""
    lowered = text.lower()
    return lowered

# Build ./test.csv: one tab-separated line per qualifying mail, enriched with
# the sender's customer record.
with open("./test.csv", 'w') as output_file:
    # Pass 1: index customers from the target countries by e-mail address.
    with open("data/customers.csv") as customers_file:
        for line in customers_file:
            customer_id, customer_email, bank, country, level = line.strip().split(',')
            is_premium = level == 'premium'
            if country not in target_countries:
                continue
            customer_data[customer_email] = (bank, country, is_premium)

    # Pass 2: keep mails from 2022 onwards that match the use case and whose
    # sender is one of the indexed customers.
    with open("data/mails.csv") as mails_file:
        for line in mails_file:
            mail_id, email, raw_date, mail_subject, mail_text = line.strip().split(",")
            mail_date = parser.parse(raw_date)
            if mail_date.year < 2022 or not matches_usecase(mail_text):
                continue
            if email not in customer_data:
                continue

            bank, country, is_premium = customer_data[email]
            title = sanitize(mail_subject)
            text = sanitize(mail_text)
            # The sentiment is predicted on the raw (non-lowercased) mail text.
            prediction = sentiment_predictor(mail_text)[0]
            sentiment = prediction['label'].lower()
            output_file.write(f"{title}\t{text}\t{bank}\t{country}\t{sentiment}\t{is_premium}\n")

In [2]:
# At the top level of a notebook cell, locals() is the module namespace, so
# names defined in earlier cells can be looked up by their string name.
local_vars = locals()
#local_vars['target_countries']
# As the cell's last expression, this makes the notebook display the function object.
local_vars['matches_usecase']

<function __main__.matches_usecase(text)>

In [3]:
!cat ./test.csv

hello	bank account awesome	Deutsche Bank	FR	positive	False
hello	bank account denied	Deutsche Bank	DE	negative	True
hello	complaint	Deutsche Bank	FR	negative	False
hello	no complaint only praise	Deutsche Bank	DE	positive	True


The following code is written in Python with for loops and manual data parsing. Please rewrite the code to use a dataframe library called lester. lester has an API similar to pandas and supports the following operations from pandas: 'merge', 'query', 'assign', 'explode', 'rename'. The 'assign' method in lester has two additional parameters: `target_column` and `source_columns`; `target_column` refers to the new column that should be created, while `source_columns` refers to the list of input columns that are used by the expression in `assign`. Please create a single, separate `assign` statement for each new column that is computed. Only respond with Python code. Do not iterate over dataframes. The code should contain a single function called `_lester_dataprep`, which returns a single dataframe called `result_df` as its result. This final dataframe should have the following columns: title, text, bank, country, is_premium


In [4]:
import lester as lt
from transformers import pipeline
from dateutil import parser

# Customers are restricted to these countries via the '@target_countries' query below.
target_countries = ['UK', 'DE', 'FR']
# transformers sentiment-analysis pipeline; the 'label' of its first prediction is used per mail.
sentiment_predictor = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def matches_usecase(text):
    """Tell whether *text* contains "complaint" or "bank account" (case-sensitive)."""
    if "complaint" in text:
        return True
    return "bank account" in text

def sanitize(text):
    """Return a lower-cased copy of *text*."""
    result = text.lower()
    return result

def _lester_dataprep():
    """Prepare the mail dataset with pandas-style dataframe operations.

    Reads ``data/customers.csv`` and ``data/mails.csv`` (both read without a
    header row), keeps customers from ``target_countries`` and mails dated
    2022 or later whose text satisfies ``matches_usecase``, merges the two on
    the e-mail address and derives the sanitized ``title`` and ``text``
    columns.

    Returns:
        result_df: dataframe with the columns ``title``, ``text``, ``bank``,
        ``country`` and ``is_premium``.
    """
    # Load customer data
    customer_df = lt.read_csv("data/customers.csv", header=None, names=['customer_id', 'customer_email', 'bank', 'country', 'level'])
    customer_df = customer_df.query('country in @target_countries')
    customer_df = customer_df.assign(target_column='is_premium', source_columns=['level'], func=lambda row: row['level'] == 'premium')

    # Select relevant columns for merging
    customer_df = customer_df[['customer_email', 'bank', 'country', 'is_premium']]

    # Load mails data
    mails_df = lt.read_csv("data/mails.csv", header=None, names=['mail_id', 'email', 'raw_date', 'mail_subject', 'mail_text'])
    mails_df = mails_df.assign(target_column='mail_date', source_columns=['raw_date'], func=lambda row: parser.parse(row['raw_date']))
    mails_df = mails_df.query('mail_date.dt.year >= 2022')
    # `matches_usecase` is a Python-level name, not a column, so it needs the
    # '@' prefix just like `target_countries` above.
    mails_df = mails_df.query('mail_text.apply(@matches_usecase)')

    # Merge dataframes
    merged_df = lt.merge(mails_df, customer_df, left_on='email', right_on='customer_email')

    # Process and assign new columns (one assign per derived column)
    merged_df = merged_df.assign(target_column='title', source_columns=['mail_subject'], func=lambda row: sanitize(row['mail_subject']))
    merged_df = merged_df.assign(target_column='text', source_columns=['mail_text'], func=lambda row: sanitize(row['mail_text']))
    # NOTE(review): 'sentiment' is computed here but is not among the returned
    # columns below — confirm whether it should be part of the result.
    merged_df = merged_df.assign(target_column='sentiment', source_columns=['mail_text'], func=lambda row: sentiment_predictor(row['mail_text'])[0]['label'].lower())

    # Select the required columns
    result_df = merged_df[['title', 'text', 'bank', 'country', 'is_premium']]

    return result_df

result_df = _lester_dataprep()

AttributeError: 'TrackedDataframe' object has no attribute 'query'

In the following Python code, please make sure that all local variables referenced in the `query` function start with '@':

In the following Python code, please rename the function `assign` to `project`, the function `merge` to `join` and the function `query` to `filter`.

In [None]:
import lester as lt
from transformers import pipeline
from dateutil import parser

# Customers are restricted to these countries via the '@target_countries' filter below.
target_countries = ['UK', 'DE', 'FR']
# transformers sentiment-analysis pipeline; the 'label' of its first prediction is used per mail.
sentiment_predictor = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def matches_usecase(text):
    """Check *text* for the use-case keywords with a case-sensitive substring search."""
    for needle in ("complaint", "bank account"):
        if needle in text:
            return True
    return False

def sanitize(text):
    """Lower-case *text* so downstream columns are case-normalised."""
    normalised = text.lower()
    return normalised

def _lester_dataprep():
    """Prepare the mail dataset with lester dataframe operations.

    Reads ``data/customers.csv`` and ``data/mails.csv`` (both read without a
    header row), keeps customers from ``target_countries`` and mails dated
    2022 or later whose text satisfies ``matches_usecase``, joins the two on
    the e-mail address and derives the sanitized ``title`` and ``text``
    columns.

    Returns:
        result_df: dataframe with the columns ``title``, ``text``, ``bank``,
        ``country`` and ``is_premium``.
    """
    # Load customer data
    customer_df = lt.read_csv("data/customers.csv", header=None, names=['customer_id', 'customer_email', 'bank', 'country', 'level'])
    customer_df = customer_df.filter('country in @target_countries')
    customer_df = customer_df.project(target_column='is_premium', source_columns=['level'], func=lambda row: row['level'] == 'premium')

    # Select relevant columns for joining
    customer_df = customer_df[['customer_email', 'bank', 'country', 'is_premium']]

    # Load mails data
    mails_df = lt.read_csv("data/mails.csv", header=None, names=['mail_id', 'email', 'raw_date', 'mail_subject', 'mail_text'])
    mails_df = mails_df.project(target_column='mail_date', source_columns=['raw_date'], func=lambda row: parser.parse(row['raw_date']))
    mails_df = mails_df.filter('mail_date.dt.year >= 2022')
    # `matches_usecase` is a Python-level name, not a column, so it needs the
    # '@' prefix just like `target_countries` above.
    mails_df = mails_df.filter('mail_text.apply(@matches_usecase)')

    # Join dataframes
    merged_df = lt.join(mails_df, customer_df, left_on='email', right_on='customer_email')

    # Process and project new columns (one project call per derived column)
    merged_df = merged_df.project(target_column='title', source_columns=['mail_subject'], func=lambda row: sanitize(row['mail_subject']))
    merged_df = merged_df.project(target_column='text', source_columns=['mail_text'], func=lambda row: sanitize(row['mail_text']))
    # NOTE(review): 'sentiment' is computed here but is not among the returned
    # columns below — confirm whether it should be part of the result.
    merged_df = merged_df.project(target_column='sentiment', source_columns=['mail_text'], func=lambda row: sentiment_predictor(row['mail_text'])[0]['label'].lower())

    # Select the required columns
    result_df = merged_df[['title', 'text', 'bank', 'country', 'is_premium']]

    return result_df

result_df = _lester_dataprep()

In [None]:
In the 