In [1]:
import pandas as pd
import os
import json
import re
import requests
from bs4 import BeautifulSoup

In [2]:
class DataManager():
    """Load the IMDB training CSVs and map special title characters to ASCII names.

    On construction the training CSVs are combined into ``self.train_df`` and
    every character of ``primaryTitle`` that is not in the allowed set is looked
    up on ``self.url``; usable results are collected in ``self.mapper`` and
    saved to ``mapper.json``. ``restore_column`` then applies the mapper to the
    affected titles.
    """

    # Root of the data tree. A relative value is resolved against the current
    # working directory; pass ``data_folder`` to ``__init__`` to override it.
    imdb_data_folder = 'imdb/data'

    # Name of the combined file written by load_train_data. It is written with
    # ';' as separator, so it must not be read back as a training input.
    combined_train_file = 'train-0.csv'

    # Seconds to wait for the Unicode lookup site before giving up.
    request_timeout = 10

    def __init__(self, data_folder=None) -> None:
        """Load the training data and build the character mapper.

        Args:
            data_folder: Optional replacement for ``imdb_data_folder`` for this
                instance. ``None`` keeps the class-level default.

        Raises:
            FileNotFoundError: If the training directory does not exist or
                holds no training CSV files.
        """
        if data_folder is not None:
            self.imdb_data_folder = data_folder
        self.url = "https://www.compart.com/en/unicode"
        self.mapper = {}
        self.load_train_data()
        self.extract_special_chars()
        # The JSON, validation and test loaders are not run automatically;
        # call load_json_data / load_validation_data / load_test_data on demand.

    def load_train_data(self):
        """Combine all training CSVs into ``self.train_df`` and save the result.

        Every ``*.csv`` file in ``<data folder>/csv/train`` except the combined
        output file is read. The ``Unnamed: 0`` column becomes the sorted
        ``index``, missing values are replaced with the string ``'NULL'``, and
        the frame is written to ``train-0.csv`` with ``;`` as separator.

        Raises:
            FileNotFoundError: If the directory is missing or has no input CSVs.
        """
        path = f"{self.imdb_data_folder}/csv/train"
        # Skip only the combined output file (not every name containing '0')
        # and ignore non-CSV entries such as OS metadata files.
        csv_files = sorted(
            name for name in os.listdir(path)
            if name.endswith('.csv') and name != self.combined_train_file
        )
        if not csv_files:
            raise FileNotFoundError(f"No training CSV files found in '{path}'")

        # Read everything first and concatenate once instead of growing a frame.
        frames = [pd.read_csv(f"{path}/{name}") for name in csv_files]
        train_df = pd.concat(frames, ignore_index=True)

        self.train_df = (
            train_df
            .rename(columns={"Unnamed: 0": 'index'})
            .sort_values(by='index')
            .set_index('index')
            .fillna('NULL')
        )
        self.train_df.to_csv(f"{path}/{self.combined_train_file}", sep=';')

    def _load_hidden_csv(self, file_name):
        """Read one CSV from ``csv/test_and_validation``, indexed and sorted by ``index``."""
        path = f"{self.imdb_data_folder}/csv/test_and_validation"
        df = pd.read_csv(f"{path}/{file_name}")
        return df.rename(columns={"Unnamed: 0": 'index'}).sort_values(by='index').set_index('index')

    def load_validation_data(self):
        """Load ``validation_hidden.csv`` into ``self.validation_df``."""
        self.validation_df = self._load_hidden_csv("validation_hidden.csv")

    def load_test_data(self):
        """Load ``test_hidden.csv`` into ``self.test_df``."""
        self.test_df = self._load_hidden_csv("test_hidden.csv")

    def load_json_data(self):
        """Load the writing/directing JSON files and left-join them on ``movie``.

        Sets ``self.directing_df``, ``self.writing_df`` and ``self.joined_df``.
        """
        path = f"{self.imdb_data_folder}/json"

        self.directing_df = pd.read_json(f"{path}/directing.json")
        self.writing_df = pd.read_json(f"{path}/writing.json")

        self.joined_df = pd.merge(self.writing_df, self.directing_df, how='left', on='movie')

    def extract_special_chars(self):
        """Find non-allowed characters in ``primaryTitle`` and build the mapper.

        Adds a ``special_chars`` column to ``self.train_df`` holding what is
        left of each title after removing the allowed characters, stores the
        index labels of rows with leftovers in ``self.index``, looks up each
        distinct leftover character once, and writes ``self.mapper`` to
        ``mapper.json`` in the current working directory.
        """
        allowed = re.compile(r'[a-zA-Z0-9 ,°!?@#$%&:;+~_/\-\"\'\^\*\(\)\.\[\]]')
        self.train_df['special_chars'] = self.train_df['primaryTitle'].apply(
            lambda title: allowed.sub('', title)
        )
        special_values_df = self.train_df[self.train_df['special_chars'] != ''][['special_chars']]
        self.index = special_values_df.index.tolist()

        # Track every character already attempted, not just those that ended up
        # in the mapper, so a character without a usable result is looked up once.
        attempted = set(self.mapper)
        for chars in special_values_df['special_chars'].tolist():
            for char in chars:
                if char in attempted:
                    continue
                attempted.add(char)
                self.add_char_to_mapper(char)

        with open('mapper.json', 'w', encoding='utf-8') as f:
            json.dump(self.mapper, f, indent=2)

    def add_char_to_mapper(self, char):
        """Look up ``char`` on the Unicode site and store a replacement if usable.

        Fetches ``<self.url>/U+XXXX`` and takes the first space-delimited token
        of the first child element's text in the ``second-column`` cell of the
        last row of the page's first table. The token is stored in
        ``self.mapper[char]`` only when it starts with an ASCII letter. If the
        expected page elements are missing, nothing is stored.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-success HTTP status.
        """
        code_point = f"U+{ord(char):04X}"

        response = requests.get(f"{self.url}/{code_point}", timeout=self.request_timeout)
        response.raise_for_status()

        table = BeautifulSoup(response.text, 'html.parser').table
        tbody = table.find("tbody") if table is not None else None
        rows = tbody.find_all("tr") if tbody is not None else []
        if not rows:
            return

        td = rows[-1].find('td', {"class": 'second-column'})
        first_child = td.findChild() if td is not None else None
        if first_child is None:
            return

        val = first_child.text.split(' ')[0]
        if re.match('[a-zA-Z]', val):
            self.mapper[char] = val

    def restore_column(self):
        """Replace mapped special characters in the affected ``primaryTitle`` rows.

        Only rows whose index label is in ``self.index`` (set by
        ``extract_special_chars``) are rewritten.
        """
        mask = self.train_df.index.isin(self.index)
        self.train_df.loc[mask, 'primaryTitle'] = (
            self.train_df.loc[mask, 'primaryTitle'].apply(self.replace_chars)
        )

    def replace_chars(self, s):
        """Return ``s`` with every key of ``self.mapper`` replaced by its value.

        Replacements are applied one key at a time in the mapper's order.
        """
        for char, replacement in self.mapper.items():
            if char in s:
                s = s.replace(char, replacement)
        return s

        


# Script entry point: constructing the manager loads the training data and
# builds the character mapper (both are called from __init__); restore_column
# then rewrites the affected titles.
if __name__ == '__main__':
    m = DataManager()
    m.restore_column()
    # Disabled check: reload the saved mapper and print it.
    # NOTE(review): encoding='' is not a valid codec name; use e.g. 'utf-8' if re-enabled.
    # with open('mapper.json', 'r', encoding='') as f:
    #     mapper = json.loads(f.read())
    # print(mapper)

FileNotFoundError: [Errno 2] No such file or directory: 'imdb/data/csv/train'

In [3]:
# Root of the IMDB data tree, relative to the kernel's working directory.
imdb_data_folder = 'imdb/data'
# Directory expected to hold the training CSV files.
path = imdb_data_folder + "/csv/train"

In [4]:
# Show the relative training-data path built in the previous cell.
print(path)

imdb/data/csv/train


In [5]:
# `path` is relative, so it is resolved against the kernel's current working
# directory; this call raises FileNotFoundError when imdb/data/csv/train does
# not exist there.
training_data_files = os.listdir(path=path)

FileNotFoundError: [Errno 2] No such file or directory: 'imdb/data/csv/train'

In [5]:
# List the current user's Desktop to check which folders are reachable.
# Built from the home directory instead of a hard-coded absolute user path,
# so the cell also runs on other machines and accounts.
os.listdir(os.path.join(os.path.expanduser('~'), 'Desktop'))

['.Rhistory',
 'Test_Folder',
 '.DS_Store',
 '.localized',
 'Screenshot 2022-03-04 at 18.48.31.png',
 'Assignment 3 Big Data.pdf',
 'Causal Data Science Sample Paper-1',
 'barplot 09.52.39.png',
 'KeepInMind',
 '~$bewijs.txt',
 'heatmap.png',
 '~$ogress_DJ.docx']