# Reading Excel and Spreadsheet files

In [1]:
# import the pandas library
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

print(pd.__version__)

2.1.1


## Reading Excel data

You can read the data with the help of **`.read_excel()`** function.

In [2]:
# read the dataset
data = pd.read_excel('datasets/big_mart_sales.xlsx')
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## Different challenges with Excel files

### Excel files with multiple sheets

- Excel files most of the time contain multiple sheets. 
- Using **`sheet_name`** parameter we can read a specific sheet.

In [4]:
# read the data with multiple sheets
multiple_data = pd.read_excel('datasets/big_mart_sales_with_multiple_sheets.xlsx')
print(multiple_data['Outlet_Establishment_Year'].unique())

[1985]


In [5]:
# Read the data in a specific sheet
sheet = pd.read_excel('datasets/big_mart_sales_with_multiple_sheets.xlsx', sheet_name = '1985')
sheet.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
1,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
2,FDW12,,Regular,0.0354,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432
3,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
4,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362


In [6]:
# Similarly, for 1987 and 1997
sheet1 = pd.read_excel('datasets/big_mart_sales_with_multiple_sheets.xlsx', sheet_name = '1987')
sheet2 = pd.read_excel('datasets/big_mart_sales_with_multiple_sheets.xlsx', sheet_name = '1997')

print(sheet1.shape, sheet2.shape)

(932, 12) (930, 12)


In [8]:
# Create an array of dataframes
sheets_data = [sheet, sheet1, sheet2]

# Concatenate the array
final_data = pd.concat(sheets_data)

final_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
1,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
2,FDW12,,Regular,0.0354,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432
3,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
4,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362


In [9]:
print('Shape:', final_data.shape)

Shape: (3325, 12)


### Read the data except the first few rows in the file

Pass the parameter **`skiprows = n (number of rows to skip)`**.

In [10]:
# read the data without skiprows
data = pd.read_excel('datasets/big_mart_sales_comments.xlsx')
data.head()

Unnamed: 0,# This is Big Mart Sales Data,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,# This is Big Mart Sales Data,,,,,,,,,,,
1,# This is Big Mart Sales Data,,,,,,,,,,,
2,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
4,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228


In [11]:
# read the data with skiprows
data = pd.read_excel('datasets/big_mart_sales_comments.xlsx', skiprows = 3)
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Reading data from multiple directories

Use the **`glob`** library to list the files in a directory.

In [12]:
# import the library
import glob

# list to store data frames
merged_data = []

# list all the files in the folder
for i in glob.glob('datasets/multi-directory-excel/*'):
    print(i)

    # List all the files present in the sub-folder
    for file in glob.glob(i + '/*'):
        print(file)
        # Add to the list
        merged_data.append(pd.read_csv(file))

datasets/multi-directory-excel\1985
datasets/multi-directory-excel\1985\1985.xlsx
datasets/multi-directory-excel\1987
datasets/multi-directory-excel\1987\1987.xlsx
datasets/multi-directory-excel\1997
datasets/multi-directory-excel\1997\1997.xlsx
datasets/multi-directory-excel\1998
datasets/multi-directory-excel\1998\1998.xlsx
datasets/multi-directory-excel\1999
datasets/multi-directory-excel\1999\1999.xlsx
datasets/multi-directory-excel\2002
datasets/multi-directory-excel\2002\2002.xlsx
datasets/multi-directory-excel\2004
datasets/multi-directory-excel\2004\2004.xlsx
datasets/multi-directory-excel\2007
datasets/multi-directory-excel\2007\2007.xlsx
datasets/multi-directory-excel\2009
datasets/multi-directory-excel\2009\2009.xlsx


In [13]:
# concatenate the dataframes
final_data = pd.concat(merged_data)
final_data.head()

Unnamed: 0.1,Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
1,18,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
2,21,FDW12,,Regular,0.0354,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432
3,23,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
4,29,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362
