In [7]:
import json
import data_cleaner
import pandas as pd

In [8]:
import os

# Directory that holds the per-brand JSON data files (e.g. 'Apple_data.json').
directory_path = 'data/'

# Collect the JSON data files found directly inside the directory.
# - Only regular files ending in '.json' are kept, so a stray non-JSON file
#   (editor backup, OS metadata file, ...) is never handed to json.load later.
# - os.listdir returns entries in arbitrary order, so the names are sorted
#   case-insensitively (ties broken by the exact name) to make the load order,
#   and therefore the row order of the concatenated table, reproducible.
file_names = sorted(
    (
        f for f in os.listdir(directory_path)
        if f.lower().endswith('.json')
        and os.path.isfile(os.path.join(directory_path, f))
    ),
    key=lambda name: (name.lower(), name),
)
file_names

['Alcatel_data.json',
 'Apple_data.json',
 'Asus_data.json',
 'Blu_data.json',
 'HTC_data.json',
 'Huawei_data.json',
 'Infinix_data.json',
 'Lenovo_data.json',
 'LG_data.json',
 'Nokia_data.json',
 'Samsung_data.json',
 'Sony_data.json',
 'Xiaomi_data.json',
 'ZTE_data.json']

In [9]:
# Build one DataFrame per data file, keeping only the products that pass the
# filters below, and collect the frames in `dataframes`.

# Products whose parsed launch age (stored as "years_since_release") exceeds
# this many years are dropped.
MAX_YEARS_SINCE_RELEASE = 6
# os_parser codes that are kept; products with any other code are skipped.
# NOTE(review): what codes 1 and 2 stand for is defined in data_cleaner.os_parser -- confirm there.
ALLOWED_OS_CODES = (1, 2)

dataframes = []
for f in file_names:
    # Build the path the same way the file-listing cell does, so it stays
    # correct even if directory_path loses its trailing slash.
    file_path = os.path.join(directory_path, f)
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm the files' encoding and pin it (e.g. encoding='utf-8').
    with open(file_path, 'r') as file:
        data = json.load(file)

    parsed_data = []
    # Each file maps brand -> {product_name -> spec}; a missing spec section
    # falls back to an empty list.
    for brand, products in data.items():
        for product_name, spec in products.items():
            # Skip any product whose name contains "watch" (case-insensitive).
            if "watch" in product_name.lower():
                continue

            # launch_data[0] is stored below as "years_since_release".
            # NOTE(review): launch_data[1] looks like a status flag from parse_date
            # (falsy values and 2 are both rejected) -- confirm its meaning.
            launch_data = data_cleaner.parse_date(spec.get('Launch', []))
            # Keep the flag test first: it short-circuits before the age comparison.
            if not launch_data[1] or launch_data[0] > MAX_YEARS_SINCE_RELEASE:
                continue
            if launch_data[1] == 2:
                continue

            # Deliberately NOT named `os`: that would rebind the `os` module
            # imported earlier in the notebook and break later os.path calls
            # (including a re-run of this very cell).
            os_code = data_cleaner.os_parser(spec.get("Platform", []))
            if os_code not in ALLOWED_OS_CODES:
                continue

            display_information = data_cleaner.parse_display_information(spec.get("Display", []))

            # A 0 in the parsed memory data causes the product to be skipped.
            # NOTE(review): presumably 0 marks an unparsed RAM/storage value -- confirm.
            memory_data = data_cleaner.parse_memory_data(spec.get("Memory", []))
            if 0 in memory_data:
                continue

            product_features = {
                # Brand key with any digit characters stripped out.
                "brand": "".join(char for char in brand if not char.isdigit()),
                "product_name": product_name,
                "5g": data_cleaner.parse_5g(spec.get("Network", [])),
                "years_since_release": launch_data[0],
                "resolution": display_information[0],
                "size": display_information[1],
                # Reuse the code parsed above instead of parsing "Platform" a second time.
                "os": os_code,
                "battery": data_cleaner.get_battery(spec.get("Battery", [])),
                "ram": memory_data[0],
                "storage": memory_data[1],
                "price": data_cleaner.parse_price(spec.get("Misc", [])),
            }
            parsed_data.append(product_features)

    # One frame per file, built once from the list of records.
    df = pd.DataFrame(parsed_data)
    dataframes.append(df)


In [10]:
# Stack the per-file frames row-wise into a single table; ignore_index=True
# renumbers the rows 0..n-1 instead of keeping each frame's own index.
detailed_df = pd.concat(objs=dataframes, axis=0, ignore_index=True)
# Print the column dtypes and non-null counts of the combined table.
detailed_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1173 entries, 0 to 1172
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                1173 non-null   object 
 1   product_name         1173 non-null   object 
 2   5g                   1173 non-null   int64  
 3   years_since_release  1173 non-null   int64  
 4   resolution           1173 non-null   int64  
 5   size                 1173 non-null   float64
 6   os                   1173 non-null   int64  
 7   battery              1173 non-null   int64  
 8   ram                  1173 non-null   float64
 9   storage              1173 non-null   float64
 10  price                1025 non-null   float64
dtypes: float64(4), int64(5), object(2)
memory usage: 100.9+ KB


In [11]:
# NOTE(review): this repeats the detailed_df.info() call made in the previous
# cell with no change to detailed_df in between, so the printed summary is
# identical -- this cell is a candidate for removal.
detailed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1173 entries, 0 to 1172
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                1173 non-null   object 
 1   product_name         1173 non-null   object 
 2   5g                   1173 non-null   int64  
 3   years_since_release  1173 non-null   int64  
 4   resolution           1173 non-null   int64  
 5   size                 1173 non-null   float64
 6   os                   1173 non-null   int64  
 7   battery              1173 non-null   int64  
 8   ram                  1173 non-null   float64
 9   storage              1173 non-null   float64
 10  price                1025 non-null   float64
dtypes: float64(4), int64(5), object(2)
memory usage: 100.9+ KB


In [12]:
# Count the missing values in each column (isna is the same check as isnull).
detailed_df.isna().sum()

brand                    0
product_name             0
5g                       0
years_since_release      0
resolution               0
size                     0
os                       0
battery                  0
ram                      0
storage                  0
price                  148
dtype: int64

In [13]:
# Display the combined table; for a long frame the rendered output elides the
# middle rows and shows only the first and last ones.
# NOTE(review): consider detailed_df.head() for a shorter, explicit preview.
detailed_df

Unnamed: 0,brand,product_name,5g,years_since_release,resolution,size,os,battery,ram,storage,price
0,alcatel,1B (2022),0,3,1036800,5.50,1,3000,2.0,32.0,103.0
1,alcatel,1L Pro (2021),0,4,1123200,6.10,1,3000,2.0,32.0,113.3
2,alcatel,1 (2021),0,4,460800,5.00,1,2000,1.0,8.0,61.8
3,alcatel,3L (2021),0,4,1152000,6.52,1,4000,4.0,64.0,339.9
4,alcatel,1S (2021),0,4,1152000,6.52,1,4000,3.0,32.0,133.9
...,...,...,...,...,...,...,...,...,...,...,...
1168,ZTE,Axon 10 Pro 5G,1,6,2527200,6.47,1,4000,6.0,128.0,927.0
1169,ZTE,Axon 10 Pro,0,6,2527200,6.47,1,4000,6.0,128.0,484.1
1170,ZTE,Blade V10,0,6,2462400,6.30,1,3200,3.0,32.0,
1171,ZTE,Blade V10 Vita,0,6,1094400,6.26,1,3200,2.0,32.0,
