In [2]:
import pandas as pd

In [3]:
# 1 & 2 - Reading the csv file
df = pd.read_csv("lab1_data.csv", sep=";")
print(df)

          id             name          price currency  created_at
0   SKU-1001           shoes             799      SEK  2024-01-10
1   SKU-1002            pants            520      SEK  2024/02/15
2   SKU-1003           shirts            450      SEK         NaN
3        NaN           jacket           -200     SEK   2024-13-01
4   SKU-1005              NaN            NaN      NaN            
5   SKU-1006    designer coat            NaN      SEK  2024-05-01
6        NaN              hat           free      SEK         NaN
7   SKU-1008          gloves             120      NaN  2024-06-20
8   SKU-1009            socks             99      SEK  2024-02-01
9   SKU-1010            boots           1500      SEK  2024-02-15
10  SKU-1011            scarf            200      SEK  2024-03-01
11  SKU-1012             belt            300      SEK  2024/03/10
12  SKU-1013              cap             75      SEK  2024-04-01
13  SKU-1014          t-shirt              0      SEK  2024-04-10
14  SKU-10

In [None]:
# 3 - DATA PREPARATION 

In [5]:
# Removing white spaces 
dirty_df = df
dirty_df[["id", "name", "price", "currency", "created_at"]] = dirty_df[["id", "name", "price", "currency", "created_at"]].apply(lambda x: x.str.strip())
dirty_df 

Unnamed: 0,id,name,price,currency,created_at
0,SKU-1001,shoes,799,SEK,2024-01-10
1,SKU-1002,pants,520,SEK,2024/02/15
2,SKU-1003,shirts,450,SEK,
3,,jacket,-200,SEK,2024-13-01
4,SKU-1005,,,,
5,SKU-1006,designer coat,,SEK,2024-05-01
6,,hat,free,SEK,
7,SKU-1008,gloves,120,,2024-06-20
8,SKU-1009,socks,99,SEK,2024-02-01
9,SKU-1010,boots,1500,SEK,2024-02-15


In [6]:
# Dates format
dirty_df["created_at"] = pd.to_datetime(dirty_df["created_at"], errors='coerce')
dirty_df

Unnamed: 0,id,name,price,currency,created_at
0,SKU-1001,shoes,799,SEK,2024-01-10
1,SKU-1002,pants,520,SEK,NaT
2,SKU-1003,shirts,450,SEK,NaT
3,,jacket,-200,SEK,NaT
4,SKU-1005,,,,NaT
5,SKU-1006,designer coat,,SEK,2024-05-01
6,,hat,free,SEK,NaT
7,SKU-1008,gloves,120,,2024-06-20
8,SKU-1009,socks,99,SEK,2024-02-01
9,SKU-1010,boots,1500,SEK,2024-02-15


In [7]:
# From INTEGER ---> FLOAT
dirty_df["price"] = pd.to_numeric(dirty_df["price"], errors="coerce")
dirty_df

# index 3 and 14: neg price
# index 13: free ---> 0.0

Unnamed: 0,id,name,price,currency,created_at
0,SKU-1001,shoes,799.0,SEK,2024-01-10
1,SKU-1002,pants,520.0,SEK,NaT
2,SKU-1003,shirts,450.0,SEK,NaT
3,,jacket,-200.0,SEK,NaT
4,SKU-1005,,,,NaT
5,SKU-1006,designer coat,,SEK,2024-05-01
6,,hat,,SEK,NaT
7,SKU-1008,gloves,120.0,,2024-06-20
8,SKU-1009,socks,99.0,SEK,2024-02-01
9,SKU-1010,boots,1500.0,SEK,2024-02-15


In [8]:
# 4 - Flagging the Nan or Nat 
checked_columns = ["id", "name", "price", "currency", "created_at"]
dirty_df["is_flagged"] = dirty_df[checked_columns].isna().any(axis=1)|(dirty_df["price"] < 0)
dirty_df

Unnamed: 0,id,name,price,currency,created_at,is_flagged
0,SKU-1001,shoes,799.0,SEK,2024-01-10,False
1,SKU-1002,pants,520.0,SEK,NaT,True
2,SKU-1003,shirts,450.0,SEK,NaT,True
3,,jacket,-200.0,SEK,NaT,True
4,SKU-1005,,,,NaT,True
5,SKU-1006,designer coat,,SEK,2024-05-01,True
6,,hat,,SEK,NaT,True
7,SKU-1008,gloves,120.0,,2024-06-20,True
8,SKU-1009,socks,99.0,SEK,2024-02-01,False
9,SKU-1010,boots,1500.0,SEK,2024-02-15,False


In [18]:
# 5 - Avvisa omöjliga värden 
rejected_condition = ( 
    (dirty_df["id"].isna()) | 
    (dirty_df["price"] < 0) | 
    (dirty_df)["price"].isna() |
    ( dirty_df["currency"].isna()) 
    )

rejected_df = dirty_df[rejected_condition].copy()
accepted_df = dirty_df[~rejected_condition].copy()

In [19]:
rejected_df

# pdf 6 sida 52 reject när obligatoriskt fält saknas = currency 
# pris at 0.0 is not an issue as string was "free"
# outliers --->  flagged

Unnamed: 0,id,name,price,currency,created_at,is_flagged
3,,jacket,-200.0,SEK,NaT,True
4,SKU-1005,,,,NaT,True
5,SKU-1006,designer coat,,SEK,2024-05-01,True
6,,hat,,SEK,NaT,True
7,SKU-1008,gloves,120.0,,2024-06-20,True
14,SKU-1015,hoodie,-50.0,SEK,2024-04-15,True
42,SKU-1043,cardigan,,SEK,2024-11-15,True
51,SKU-1052,coat,,SEK,2025-01-10,True
54,SKU-1055,scarf,300.0,,2025-02-05,True


In [11]:
# accepted_df - uncomment if needed

In [12]:
# 6 - analytics_summary preparation with utlity methods
# From accepted_df
analytics = {
    "snitpris" : [accepted_df['price'].mean().round()],
    "medianpris" : [accepted_df['price'].median()],
    "antal_produkter" : [len(accepted_df)],
    "antal_produkter_inget_pris" : [accepted_df['price'].isna().sum()],
    }
analytics

{'snitpris': [np.float64(23456.0)],
 'medianpris': [np.float64(600.0)],
 'antal_produkter': [46],
 'antal_produkter_inget_pris': [np.int64(0)]}

In [13]:
# From dict ---> DataFrame
analytics_summary = pd.DataFrame(analytics)
analytics_summary.to_csv("analytics_summary.csv", index=False)
print("analytics_summary skapad:")

analytics_summary skapad:


In [14]:
# price_analysis.csv - Top 10 dyraste produkter
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
p_analyis = pd.DataFrame(accepted_df).sort_values(by=["price"], ascending=False).head(10)
p_analyis


Unnamed: 0,id,name,price,currency,created_at,is_flagged
38,SKU-1039,coat,999999.0,SEK,2024-10-15,False
20,SKU-1021,ring,25000.0,SEK,2024-06-05,False
19,SKU-1020,watch,15000.0,SEK,2024-06-01,False
31,SKU-1032,suit,5000.0,SEK,2024-09-01,False
39,SKU-1040,parka,4500.0,SEK,2024-11-01,False
26,SKU-1027,winter jacket,3500.0,SEK,2024-07-15,False
15,SKU-1016,coat,2500.0,SEK,2024-05-01,False
33,SKU-1034,blazer,2200.0,SEK,2024-09-10,False
9,SKU-1010,boots,1500.0,SEK,2024-02-15,False
46,SKU-1047,heels,1500.0,SEK,2024-12-15,False


In [15]:
# Top 10 produkter med mest avvikande pris 
accepted_df.describe()

Unnamed: 0,price,created_at
count,46.0,43
mean,23455.586957,2024-08-15 01:06:58.604651264
min,0.0,2024-01-10 00:00:00
25%,262.5,2024-06-03 00:00:00
50%,600.0,2024-08-15 00:00:00
75%,1175.0,2024-11-07 12:00:00
max,999999.0,2025-02-01 00:00:00
std,147243.345589,


In [16]:
# avvikande pris
# accepted_df['price'].median()
# accepted_df['outliers'] = (accepted_df['price'] - median_price).abs()
# needs .copy() mayeb not to chnage the source accepted_df
# top_10_outliers = accepted_df.nlargest(10, 'outliers')

In [17]:
accepted_df.describe()

Unnamed: 0,price,created_at
count,46.0,43
mean,23455.586957,2024-08-15 01:06:58.604651264
min,0.0,2024-01-10 00:00:00
25%,262.5,2024-06-03 00:00:00
50%,600.0,2024-08-15 00:00:00
75%,1175.0,2024-11-07 12:00:00
max,999999.0,2025-02-01 00:00:00
std,147243.345589,
