In [8]:
pip install kaggle



To use the Kaggle API, you'll need to set up your API key. Here's how:

1. Go to your Kaggle account settings page: [https://www.kaggle.com/me/account](https://www.kaggle.com/me/account)
2. Scroll down to the "API" section and click "Create New API Token". This will download a `kaggle.json` file to your computer.
3. In Colab, click on the "🔑 Secrets" tab in the left sidebar.
4. Click "Add new secret" and enter `KAGGLE_JSON` as the name.
5. Open the `kaggle.json` file you downloaded and copy its content. Paste the content into the "Value" field for the `KAGGLE_JSON` secret.
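6. Add a new secret named `KAGGLE_USERNAME` whose value is your Kaggle username (the `username` field in `kaggle.json`).
7. Add a new secret named `KAGGLE_KEY` whose value is your Kaggle API key (the `key` field in `kaggle.json`). The notebook copies these two secrets into environment variables of the same names.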

In [61]:
pip install pymysql

Collecting pymysql
  Downloading PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Downloading PyMySQL-1.1.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.1.1


In [65]:
pip install sqlalchemy



In [1]:
import os
from google.colab import userdata

# Copy the Kaggle credentials out of Colab's secret store into environment
# variables with the same names, one secret at a time.
for secret_name in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)

In [2]:
# Download the najir0123/walmart-10k-sales-datasets archive with the kaggle CLI.
# The zip is written to the current working directory (/content in the recorded run).
# NOTE(review): authentication presumably comes from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables set earlier in the notebook — confirm.

!kaggle datasets download najir0123/walmart-10k-sales-datasets

Dataset URL: https://www.kaggle.com/datasets/najir0123/walmart-10k-sales-datasets
License(s): MIT
Downloading walmart-10k-sales-datasets.zip to /content
  0% 0.00/143k [00:00<?, ?B/s]
100% 143k/143k [00:00<00:00, 219MB/s]


In [3]:
#Unzip the file
!unzip /content/walmart-10k-sales-datasets.zip


Archive:  /content/walmart-10k-sales-datasets.zip
  inflating: Walmart.csv             


In [5]:
# importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
# Load the extracted Walmart sales CSV into a DataFrame
df = pd.read_csv('/content/Walmart.csv')

In [7]:
# (rows, columns) of the data as loaded, before any cleaning
df.shape

(10051, 11)

In [8]:
# Preview the first rows of the raw data; note that unit_price values carry a "$" prefix
df.head()

Unnamed: 0,invoice_id,Branch,City,category,unit_price,quantity,date,time,payment_method,rating,profit_margin
0,1,WALM003,San Antonio,Health and beauty,$74.69,7.0,05/01/19,13:08:00,Ewallet,9.1,0.48
1,2,WALM048,Harlingen,Electronic accessories,$15.28,5.0,08/03/19,10:29:00,Cash,9.6,0.48
2,3,WALM067,Haltom City,Home and lifestyle,$46.33,7.0,03/03/19,13:23:00,Credit card,7.4,0.33
3,4,WALM064,Bedford,Health and beauty,$58.22,8.0,27/01/19,20:33:00,Ewallet,8.4,0.33
4,5,WALM013,Irving,Sports and travel,$86.31,7.0,08/02/19,10:37:00,Ewallet,5.3,0.48


In [9]:
# Preview the last rows of the raw data
df.tail()

Unnamed: 0,invoice_id,Branch,City,category,unit_price,quantity,date,time,payment_method,rating,profit_margin
10046,9996,WALM056,Rowlett,Fashion accessories,$37,3.0,03/08/23,10:10:00,Cash,3.0,0.33
10047,9997,WALM030,Richardson,Home and lifestyle,$58,2.0,22/02/21,14:20:00,Cash,7.0,0.48
10048,9998,WALM050,Victoria,Fashion accessories,$52,3.0,15/06/23,16:00:00,Credit card,4.0,0.48
10049,9999,WALM032,Tyler,Home and lifestyle,$79,2.0,25/02/21,12:25:00,Cash,7.0,0.48
10050,10000,WALM069,Rockwall,Fashion accessories,$62,3.0,26/09/20,9:48:00,Cash,3.0,0.33


In [10]:
# Summary statistics for the numeric columns only; unit_price does not appear
# because it is still stored as strings (object dtype) at this point
df.describe()

Unnamed: 0,invoice_id,quantity,rating,profit_margin
count,10051.0,10020.0,10051.0,10051.0
mean,5025.74122,2.353493,5.825659,0.393791
std,2901.174372,1.602658,1.763991,0.090669
min,1.0,1.0,3.0,0.18
25%,2513.5,1.0,4.0,0.33
50%,5026.0,2.0,6.0,0.33
75%,7538.5,3.0,7.0,0.48
max,10000.0,10.0,10.0,0.57


In [11]:
# Per-column dtype and non-null count, used to spot missing values and columns
# that were read as strings
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10051 entries, 0 to 10050
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   invoice_id      10051 non-null  int64  
 1   Branch          10051 non-null  object 
 2   City            10051 non-null  object 
 3   category        10051 non-null  object 
 4   unit_price      10020 non-null  object 
 5   quantity        10020 non-null  float64
 6   date            10051 non-null  object 
 7   time            10051 non-null  object 
 8   payment_method  10051 non-null  object 
 9   rating          10051 non-null  float64
 10  profit_margin   10051 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 863.9+ KB


In [12]:
# Number of rows that are exact repeats (all columns equal) of an earlier row
df.duplicated().sum()

np.int64(51)

In [13]:
# Remove rows that repeat an earlier row in every column, keeping the first occurrence
df = df.drop_duplicates(keep='first')
# Sanity check: the duplicate count should now be zero
df.duplicated().sum()

np.int64(0)

In [14]:
# Missing-value count for each column
df.isna().sum()

Unnamed: 0,0
invoice_id,0
Branch,0
City,0
category,0
unit_price,31
quantity,31
date,0
time,0
payment_method,0
rating,0


In [15]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')
# Sanity check: each column should now report zero missing values
df.isna().sum()

Unnamed: 0,0
invoice_id,0
Branch,0
City,0
category,0
unit_price,0
quantity,0
date,0
time,0
payment_method,0
rating,0


In [16]:
# (rows, columns) after removing duplicate rows and rows with missing values
df.shape

(9969, 11)

In [20]:
# Convert the unit_price column from strings such as "$74.69" to float.
# regex=False treats "$" as a literal character instead of the regex
# end-of-string anchor; the default for `regex` differs between pandas versions.
# The astype(str) step keeps the cell safe to re-run: once the column is already
# float, the .str accessor would otherwise raise AttributeError.
df['unit_price'] = (
    df['unit_price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [21]:
# Check that unit_price now displays as plain numbers without the "$" prefix
df.head()

Unnamed: 0,invoice_id,Branch,City,category,unit_price,quantity,date,time,payment_method,rating,profit_margin
0,1,WALM003,San Antonio,Health and beauty,74.69,7.0,05/01/19,13:08:00,Ewallet,9.1,0.48
1,2,WALM048,Harlingen,Electronic accessories,15.28,5.0,08/03/19,10:29:00,Cash,9.6,0.48
2,3,WALM067,Haltom City,Home and lifestyle,46.33,7.0,03/03/19,13:23:00,Credit card,7.4,0.33
3,4,WALM064,Bedford,Health and beauty,58.22,8.0,27/01/19,20:33:00,Ewallet,8.4,0.33
4,5,WALM013,Irving,Sports and travel,86.31,7.0,08/02/19,10:37:00,Ewallet,5.3,0.48


In [22]:
# List the column labels currently in the frame
df.columns

Index(['invoice_id', 'Branch', 'City', 'category', 'unit_price', 'quantity',
       'date', 'time', 'payment_method', 'rating', 'profit_margin'],
      dtype='object')

In [23]:
# Add total_price: unit_price multiplied by quantity, computed row by row
df['total_price'] = df['unit_price'] * df['quantity']

# Preview the frame with the new column appended
df.head()

Unnamed: 0,invoice_id,Branch,City,category,unit_price,quantity,date,time,payment_method,rating,profit_margin,total_price
0,1,WALM003,San Antonio,Health and beauty,74.69,7.0,05/01/19,13:08:00,Ewallet,9.1,0.48,522.83
1,2,WALM048,Harlingen,Electronic accessories,15.28,5.0,08/03/19,10:29:00,Cash,9.6,0.48,76.4
2,3,WALM067,Haltom City,Home and lifestyle,46.33,7.0,03/03/19,13:23:00,Credit card,7.4,0.33,324.31
3,4,WALM064,Bedford,Health and beauty,58.22,8.0,27/01/19,20:33:00,Ewallet,8.4,0.33,465.76
4,5,WALM013,Irving,Sports and travel,86.31,7.0,08/02/19,10:37:00,Ewallet,5.3,0.48,604.17


In [24]:
# Save the cleaned DataFrame as a CSV file in the current working directory.
# index=False keeps the pandas row index out of the file: after the duplicate and
# missing-value drops it is only a gappy leftover of the raw row order, and writing
# it would add an unnamed extra column when the file is read back or loaded elsewhere.
df.to_csv('walmart_updated_data.csv', index=False)