# 10-process-data
> Importing, cleaning, testing, and saving data

This series of notebooks reflects operations to scrape, prepare, validate, and save the data.

#### Helpful packages and preliminaries

In [1]:
#data access and processing
import pandas as pd
import numpy as np
import os

In [2]:
os.chdir('/data/p_dsi/teams2023/team9/')
os.getcwd()

'/gpfs52/data/p_dsi/teams2023/team9'

In [3]:
try:
    with open("Asurion_data.xlsx", 'r') as tempfile: # OSError if file exists or is invalid
        pass
except OSError as e:
    print(e)
    # handle error here

In [4]:
df = pd.read_excel("Asurion_data.xlsx")

In [5]:
display(df)

Unnamed: 0,phone model,phone size,phone color,claim,weeks_monday
0,alcatel axel,32gb,black,1,2022-01-24
1,alcatel axel,32gb,black,3,2022-11-14
2,alcatel axel,32gb,black,1,2022-11-21
3,alcatel axel,32gb,black,3,2022-11-28
4,alcatel axel,32gb,black,1,2022-12-12
...,...,...,...,...,...
26656,samsung galaxy z fold3 5g,256gb,silver,17,2023-01-30
26657,samsung galaxy z fold3 5g,256gb,silver,26,2023-02-06
26658,samsung galaxy z fold3 5g,256gb,silver,4,2023-02-13
26659,samsung galaxy z fold3 5g,512gb,black,61,2023-02-06


###  NA values 

In [6]:
df.isnull().values.any()

False

### Checking duplicated rows

In [7]:
duplicate = df[df.duplicated()]
 
print("Duplicate Rows :")

Duplicate Rows :


In [8]:
#df[df["phone model"] == 'iphone 14']

### Listing of types of variables

In [9]:
types = df.dtypes
print(types)

phone model             object
phone size              object
phone color             object
claim                    int64
weeks_monday    datetime64[ns]
dtype: object


In [10]:
df.groupby("phone model")["phone model"].count()

phone model
alcatel axel                    5
alcatel smartflip 4052r        23
alcatel tetra                   4
apple iphone 11              1511
apple iphone 11 pro           584
                             ... 
samsung galaxy z flip 5g       57
samsung galaxy z flip3 5g     138
samsung galaxy z flip4 5g      11
samsung galaxy z fold2 5g     124
samsung galaxy z fold3 5g     167
Name: phone model, Length: 128, dtype: int64

In [11]:
df.groupby("phone size")["phone size"].count()

phone size
128gb    9373
16gb     1322
256gb    5116
32gb     2242
4gb        48
512gb     922
64gb     6344
8gb        35
gb       1259
Name: phone size, dtype: int64

In [12]:
df.groupby("phone color")["phone color"].count()

phone color
black        5192
blue         2224
bronze        203
burgundy       15
coral         220
cream          29
glow          149
gold         3455
graphite      196
gray         4243
green        1552
lavender      141
midnight      189
orange          4
pink          303
platinum      316
purple        766
red          1430
silver       3666
starlight     120
titanium       79
violet        104
white        1638
yellow        427
Name: phone color, dtype: int64

In [56]:
df.groupby("weeks_monday")["weeks_monday"].count()

weeks_monday
2021-06-28    238
2021-07-05    246
2021-07-12    238
2021-07-19    236
2021-07-26    235
2021-08-02    247
2021-08-09    262
2021-08-16    256
2021-08-23    260
2021-08-30    258
2021-09-06    256
2021-09-13    255
2021-09-20    257
2021-09-27    251
2021-10-04    268
2021-10-11    263
2021-10-18    256
2021-10-25    265
2021-11-01    262
2021-11-08    267
2021-11-15    277
2021-11-22    276
2021-11-29    270
2021-12-06    271
2021-12-13    282
2021-12-20    269
2021-12-27    268
2022-01-03    277
2022-01-10    281
2022-01-17    275
             ... 
2022-07-25    371
2022-08-01    369
2022-08-08    383
2022-08-15    389
2022-08-22    382
2022-08-29    379
2022-09-05    374
2022-09-12    385
2022-09-19    380
2022-09-26    381
2022-10-03    396
2022-10-10    386
2022-10-17    353
2022-10-24    322
2022-10-31    387
2022-11-07    372
2022-11-14    345
2022-11-21    310
2022-11-28    359
2022-12-05    344
2022-12-12    326
2022-12-19    277
2022-12-26    290
2023-01-02    3

In [13]:
df.loc[df.loc[:, 'phone size'] == 'gb']

Unnamed: 0,phone model,phone size,phone color,claim,weeks_monday
15165,apple iphone x 256,gb,gray,48,2021-06-28
15166,apple iphone x 256,gb,gray,153,2021-07-05
15167,apple iphone x 256,gb,gray,119,2021-07-12
15168,apple iphone x 256,gb,gray,87,2021-07-19
15169,apple iphone x 256,gb,gray,115,2021-07-26
...,...,...,...,...,...
24497,samsung galaxy s6 edge plus 64,gb,platinum,1,2021-12-13
24498,samsung galaxy s6 edge plus 64,gb,platinum,1,2021-12-27
24499,samsung galaxy s6 edge plus 64,gb,platinum,1,2022-04-18
24500,samsung galaxy s6 edge plus 64,gb,platinum,1,2022-06-13


### Formatting issue of phone size column (Data Cleaning)

In [14]:
df_copy = df.copy()

In [15]:
size_num = df_copy.loc[(df_copy.loc[:, 'phone size'] == 'gb'), 'phone model'].str[-3:]
size_num = size_num.astype(str)

In [16]:
#df_copy['phone size'] = 'size_num' + df_copy['phone size']
display(size_num)

15165    256
15166    256
15167    256
15168    256
15169    256
        ... 
24497     64
24498     64
24499     64
24500     64
24501     64
Name: phone model, Length: 1259, dtype: object

In [17]:
df_copy.loc[(df_copy.loc[:, 'phone size'] == 'gb'), 'phone size'] = size_num + df_copy.loc[(df_copy.loc[:, 'phone size'] == 'gb'), 'phone size']

In [18]:
df_copy.groupby("phone size")["phone size"].count()

phone size
 16gb       7
 32gb     335
 64gb     510
128gb    9382
16gb     1322
256gb    5514
32gb     2242
4gb        48
512gb     922
64gb     6344
8gb        35
Name: phone size, dtype: int64

### Creating the clean dataset

In [19]:
df2 = pd.read_csv('Asurion_clean_data.csv')
display(df2)
#list(df2.columns)

Unnamed: 0.1,Unnamed: 0,phone model,phone size,phone color,claim,weeks_monday
0,0,alcatel axel,32gb,black,1,2022-01-24
1,1,alcatel axel,32gb,black,3,2022-11-14
2,2,alcatel axel,32gb,black,1,2022-11-21
3,3,alcatel axel,32gb,black,3,2022-11-28
4,4,alcatel axel,32gb,black,1,2022-12-12
...,...,...,...,...,...,...
26656,26656,samsung galaxy z fold3 5g,256gb,silver,17,2023-01-30
26657,26657,samsung galaxy z fold3 5g,256gb,silver,26,2023-02-06
26658,26658,samsung galaxy z fold3 5g,256gb,silver,4,2023-02-13
26659,26659,samsung galaxy z fold3 5g,512gb,black,61,2023-02-06


In [21]:
# Dropping the extra index column
df2 = df2.drop('Unnamed: 0', axis = 1)
display(df2)

Unnamed: 0,phone model,phone size,phone color,claim,weeks_monday
0,alcatel axel,32gb,black,1,2022-01-24
1,alcatel axel,32gb,black,3,2022-11-14
2,alcatel axel,32gb,black,1,2022-11-21
3,alcatel axel,32gb,black,3,2022-11-28
4,alcatel axel,32gb,black,1,2022-12-12
...,...,...,...,...,...
26656,samsung galaxy z fold3 5g,256gb,silver,17,2023-01-30
26657,samsung galaxy z fold3 5g,256gb,silver,26,2023-02-06
26658,samsung galaxy z fold3 5g,256gb,silver,4,2023-02-13
26659,samsung galaxy z fold3 5g,512gb,black,61,2023-02-06


In [25]:
#df2.to_csv('Asurion_clean_data.csv', index = False)

#df_new = pd.read_csv('Asurion_clean_data.csv')
#display(df_new)

Unnamed: 0,phone model,phone size,phone color,claim,weeks_monday
0,alcatel axel,32gb,black,1,2022-01-24
1,alcatel axel,32gb,black,3,2022-11-14
2,alcatel axel,32gb,black,1,2022-11-21
3,alcatel axel,32gb,black,3,2022-11-28
4,alcatel axel,32gb,black,1,2022-12-12
...,...,...,...,...,...
26656,samsung galaxy z fold3 5g,256gb,silver,17,2023-01-30
26657,samsung galaxy z fold3 5g,256gb,silver,26,2023-02-06
26658,samsung galaxy z fold3 5g,256gb,silver,4,2023-02-13
26659,samsung galaxy z fold3 5g,512gb,black,61,2023-02-06
