In [1]:
import pandas

In [2]:
# Load the raw search results exported to CSV.
devices = pandas.read_csv(filepath_or_buffer="./files/search_result.csv")

In [3]:
# Rows without a product name cannot be matched against keywords, so drop them first.
devices = devices.dropna(subset=["name"])

# One case-insensitive mask per product keyword, built in the same order
# as the names they are unpacked into.
is_smartphone, is_tablet = (
    devices["name"].str.contains(keyword, case=False)
    for keyword in ("SMARTPHONE", "TABLET")
)

# Keep only listings whose name mentions a smartphone or a tablet.
devices = devices.loc[is_smartphone | is_tablet]

devices

Unnamed: 0,name,store_count,device_model_searched,price_cents
0,Smartphone Motorola Moto G G4 Plus Usado 32GB ...,1.0,Moto G4 Plus,40400.0
1,"Smartphone Motorola Moto G4 Plus XT1640 16,0 M...",3.0,Moto G4 Plus,72900.0
2,Smartphone Samsung Galaxy S7 SM-G930F 32GB 12....,5.0,Galaxy S7,199900.0
3,Smartphone Samsung Galaxy S7 Edge SM-G935F 32G...,4.0,Galaxy S7,249500.0
4,Smartphone Samsung Galaxy S7 Usado 32GB 12.0 M...,1.0,Galaxy S7,68300.0
...,...,...,...,...
6183,Smartphone Xiaomi Redmi Note 4X 32GB Qualcomm ...,1.0,Redmi 8,101900.0
6184,Smartphone Xiaomi Redmi 5 Plus 32GB 12.0 MP Qu...,1.0,Redmi 8,119900.0
6185,Smartphone LG K40S LMX430BMW 32GB Câmera Dupla...,11.0,LG K40S,62800.0
6186,Smartphone Samsung Galaxy A30s SM-A307G 64GB C...,17.0,Galaxy A30s,127500.0


In [4]:
# Persist the filtered smartphone/tablet listings (row index included, pandas default).
devices.to_csv(path_or_buf="./files/devices.csv")

In [5]:
# Group the listings by the model name that was searched for.
device_groups = devices.groupby("device_model_searched")

# Rounded summary statistics of the price (in cents) per model; models without
# a computable mean are discarded and the model name becomes a regular column.
devices_price_info = (
    device_groups["price_cents"]
    .describe()
    .round()
    .loc[lambda price_stats: price_stats["mean"].notnull()]
    .reset_index()
)

In [6]:
# Persist the per-model price statistics (row index included, pandas default).
devices_price_info.to_csv(path_or_buf="./files/devices_price.csv")

In [7]:
# Load only the user/device columns, collapse repeated pairs, then discard
# rows where either the user id or the device model is missing.
parquet_file = (
    pandas.read_parquet(
        "./files/flurry_user_event_date.parquet",
        columns=["userId", "deviceModel"],
    )
    .drop_duplicates()
    .dropna(subset=["userId", "deviceModel"])
)
parquet_file

Unnamed: 0,userId,deviceModel
0,aff3df3b88b1e955,Moto G4 Plus
2,1111,Moto G4 Plus
119,123,Moto G4 Plus
128,0,Moto G4 Plus
129,324,Moto G4 Plus
...,...,...
47313265,113641,iPhone 6s
47313479,222611,iPhone XR
47313604,222419,iPhone XR
47313676,222382,iPhone 8 Plus


In [8]:
def extractor(row, price_info=None):
    """Classify one user/device record by the mean price of its device model.

    Parameters
    ----------
    row : sequence
        Positional record read as ``(index, user_id, device_model)``; only
        positions 1 and 2 are used.
    price_info : pandas.DataFrame, optional
        Table with a ``device_model_searched`` column and a ``mean`` column
        (mean price, in cents). When omitted, the notebook-level
        ``devices_price_info`` frame is used.

    Returns
    -------
    dict
        ``user_id``, ``user_device_model``, ``device_model_mean_price`` (the
        first matching mean, or ``None`` when the model is not in the table)
        and ``classification`` (``"low_range"``, ``"mid_range"``,
        ``"high_end"``, or ``None`` when no strictly positive mean price is
        available).
    """
    if price_info is None:
        price_info = devices_price_info

    user_id = row[1]
    user_device_model = row[2]

    # NOTE: every call scans the whole price table for the model name.
    matching_means = price_info.loc[
        price_info["device_model_searched"] == user_device_model, "mean"
    ]
    device_model_mean_price = (
        matching_means.values[0] if not matching_means.empty else None
    )

    classification = None
    # Only a strictly positive price is classified. The explicit comparison
    # (rather than a truthiness test) keeps NaN and negative means from
    # falling through to "high_end": NaN > 0 is False.
    if device_model_mean_price is not None and device_model_mean_price > 0:
        if device_model_mean_price <= 75000:
            classification = "low_range"
        elif device_model_mean_price <= 150000:
            classification = "mid_range"
        else:
            classification = "high_end"

    return {
        "user_id": user_id,
        "user_device_model": user_device_model,
        "device_model_mean_price": device_model_mean_price,
        "classification": classification,
    }

# Run the classifier over every user/device record and collect the resulting
# dicts into a single frame (one row per record, in iteration order).
classifications_data_frame = pandas.DataFrame(
    [extractor(record) for record in parquet_file.itertuples()]
)
classifications_data_frame

Unnamed: 0,user_id,user_device_model,device_model_mean_price,classification
0,aff3df3b88b1e955,Moto G4 Plus,56650.0,low_range
1,1111,Moto G4 Plus,56650.0,low_range
2,123,Moto G4 Plus,56650.0,low_range
3,0,Moto G4 Plus,56650.0,low_range
4,324,Moto G4 Plus,56650.0,low_range
...,...,...,...,...
181029,113641,iPhone 6s,173736.0,high_end
181030,222611,iPhone XR,372133.0,high_end
181031,222419,iPhone XR,372133.0,high_end
181032,222382,iPhone 8 Plus,279114.0,high_end


In [10]:
# Persist the per-user device classification (row index included, pandas default).
classifications_data_frame.to_csv(path_or_buf="./files/classification.csv")