In [1]:
import pandas as pd
import unicodedata
import re

# Step 1: Remove diacritics
def remove_diacritics(text):
    return ''.join(
        c for c in unicodedata.normalize('NFKD', text)
        if not unicodedata.combining(c)
    )

# Step 2: Standardize text
def standardize_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.strip()  # Remove leading/trailing spaces
    return text.lower()  # Convert to lowercase

# Combine the two functions
def clean_text(text):
    text = remove_diacritics(text)
    text = standardize_text(text)
    return text

In [2]:
#Try with real data

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load all train-X.csv files
#train_files = [f"../data/train-{i}.csv" for i in range(1, 9)]
train_files = [f"../data/train-1.csv"]
df_list = [pd.read_csv(file) for file in train_files]
df = pd.concat(df_list, ignore_index=True)

In [3]:
#Clean primary title
# Apply the cleaning function to the 'primaryTitle' column
df['primaryTitle'] = df['primaryTitle'].apply(clean_text)

In [4]:
df.head(50)

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,the doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,way down east,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,destiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,the navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,the phantom of the opera,The Phantom of the Opera,1925,\N,93,17887.0,True
5,42,tt0016630,battling butler,Battling Butler,1926,\N,77,3285.0,True
6,81,tt0021015,juno and the paycock,,1929,\N,85,2275.0,False
7,118,tt0023973,the eagle and the hawk,,1933,\N,73,,True
8,119,tt0023986,employees' entrance,,1933,\N,75,,True
9,123,tt0024184,the invisible man,The Invisible Man,1933,\N,71,33562.0,True


In [5]:
#Utilize LLM for missing original title imputation
from transformers import pipeline

# Load your local LLM (replace 'your-model' with your model name)
llm = pipeline('text-generation', model='distilgpt2')

# Extract rows where 'originalTitle' is missing
missing_indices = df['originalTitle'].isnull()
missing_titles = df.loc[missing_indices, 'primaryTitle']

Device set to use cpu


In [6]:
# Function to generate missing titles
def generate_original_title(primary_title):
    prompt = f"from this movie, {primary_title} generate its original title"
    generated_text = llm(prompt, max_length=50)[0]['generated_text']
    return generated_text.strip()

In [7]:
# Generate titles only for missing rows
df.loc[missing_indices, 'originalTitle'] = missing_titles.apply(generate_original_title)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for 

In [8]:
df.head(20)

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,the doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,way down east,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,destiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,the navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,the phantom of the opera,The Phantom of the Opera,1925,\N,93,17887.0,True
5,42,tt0016630,battling butler,Battling Butler,1926,\N,77,3285.0,True
6,81,tt0021015,juno and the paycock,"from this movie, juno and the paycock generate...",1929,\N,85,2275.0,False
7,118,tt0023973,the eagle and the hawk,"from this movie, the eagle and the hawk genera...",1933,\N,73,,True
8,119,tt0023986,employees' entrance,"from this movie, employees' entrance generate ...",1933,\N,75,,True
9,123,tt0024184,the invisible man,The Invisible Man,1933,\N,71,33562.0,True


In [10]:
row = df.iloc[11]
print(row)


Unnamed: 0                                                      135
tconst                                                    tt0025028
primaryTitle                                                  dames
originalTitle     from this movie, dames generate its original t...
startYear                                                      1934
endYear                                                          \N
runtimeMinutes                                                   91
numVotes                                                     2038.0
label                                                          True
Name: 11, dtype: object
