# Pandas

In [2]:
# Pandas is a powerful data manipulation library in Python, widely used for data analysis and data cleaning. It provides two primary data structures: Series and DataFrame.
# A series is a one dimensional array like object, while a DataFrame is a two dimensional, size mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

!pip install pandas

Collecting pandas
  Downloading pandas-2.3.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.0 MB 3.7 MB/s eta 0:00:03
   --- ------------------------------------ 1.0/11.0 MB 3.9 MB/s eta 0:00:03
   ------- -------------------------------- 2.1/11.0 MB 3.4 MB/s eta 0:00:03
   ---------- ----------------------------- 2.9/11.0 MB 3.4 MB/s eta 0:00:03
   ------------- -------------------------- 3.7/11.0 MB 3.6 MB/s eta 0:00:03
   --------------- ------------------------ 4.2/11.0 MB 3.6 MB/s eta 0:00:02
   -------------------- ----------------

In [6]:
# Series : A pandas series ia a one-dimensional array-like object that can hold any data type such as integers, floats, strings, etc. It is similar to a column in a spreadsheet or a SQL table.

import pandas as pd
import numpy as np

In [4]:
data = [1, 2, 3, 4, 5]
series = pd.Series(data)
print(series)


0    1
1    2
2    3
3    4
4    5
dtype: int64


In [5]:
# Create a series from a dictionary
data_dict = {'a': 1, 'b': 2, 'c': 3}
series_dict = pd.Series(data_dict)
print(series_dict)

a    1
b    2
c    3
dtype: int64


In [7]:
data = np.array([1,2,3,4])
indexing = ['a', 'b', 'c', 'd']
series_indexed = pd.Series(data, index=indexing)
print(series_indexed)

a    1
b    2
c    3
d    4
dtype: int64


In [None]:
# A series is a single column, whereas a DataFrame is a multi-dimensional table made up of a collection of Series. Each column in a DataFrame can have a different data type (e.g., integer, float, string, etc.), making it suitable for handling heterogeneous data.

In [17]:
# DataFrame : A pandas DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns). It is similar to a spreadsheet or a SQL table.

data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

df1 = pd.DataFrame(data)
print(df1)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [10]:
np.array(df) # Convert DataFrame to NumPy array

array([['Alice', 25, 'New York'],
       ['Bob', 30, 'Los Angeles'],
       ['Charlie', 35, 'Chicago']], dtype=object)

In [11]:
print(type(df)) # Check the type of df

<class 'pandas.core.frame.DataFrame'>


In [12]:
# Create a dataframe from a list of dictionaries
data_list = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 30, 'City': 'Los Angeles'},
    {'Name': 'Charlie', 'Age': 35, 'City': 'Chicago'}
]

df_from_list = pd.DataFrame(data_list)
print(df_from_list)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [13]:
df = pd.read_csv('Car_data.csv') # Read a CSV file into a DataFrame
print(df.head()) # Display the first few rows of the DataFrame

   Unnamed: 0                    name   company  year   Price  kms_driven  \
0           0     Hyundai Santro Xing   Hyundai  2007   80000       45000   
1           1     Mahindra Jeep CL550  Mahindra  2006  425000          40   
2           2       Hyundai Grand i10   Hyundai  2014  325000       28000   
3           3  Ford EcoSport Titanium      Ford  2014  575000       36000   
4           4               Ford Figo      Ford  2012  175000       41000   

  fuel_type  
0    Petrol  
1    Diesel  
2    Petrol  
3    Diesel  
4    Diesel  


In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,4,Ford Figo,Ford,2012,175000,41000,Diesel


In [15]:
df.tail()

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
811,811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,814,Tata Zest XM,Tata,2018,260000,27000,Diesel
815,815,Mahindra Quanto C8,Mahindra,2013,390000,40000,Diesel


In [16]:
# Accessing data from a DataFrame
# Access a single column

df

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...,...
811,811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,814,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [18]:
df1

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [19]:
df1['Age']  # Accessing a single column

0    25
1    30
2    35
Name: Age, dtype: int64

In [20]:
type(df1['Age'])  # This will return <class 'pandas.core.series.Series'>

pandas.core.series.Series

In [None]:
df1.loc[0]  # Accessing a single row by label (index), loc stands for location

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [None]:
df1.iloc[1]  # Accessing a single row by integer location, iloc stands for integer location

Name            Bob
Age              30
City    Los Angeles
Name: 1, dtype: object

In [26]:
df1.iloc[0][2]  # Accessing a specific value by row and column index

  df1.iloc[0][2]  # Accessing a specific value by row and column index


'New York'

In [30]:
# Accessing a specified element

df1['Age']  # Accessing the 'Age' of the first row

0    25
1    30
2    35
Name: Age, dtype: int64

In [32]:
df1

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [33]:
df1.at[0,'Age']  # Accessing a specific value by row label and column label

np.int64(25)

In [34]:
# Accessing a specific element using at and iat

df1.at[0, 'Age']  # Accessing the 'Age' of the first row using at
df1.iat[0, 2]  # Accessing the 'Age' of the first row using iat

'New York'

In [35]:
df1['Salary'] = [50000, 60000, 70000]  # Adding a new column 'Salary' to the DataFrame

df1

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000
1,Bob,30,Los Angeles,60000
2,Charlie,35,Chicago,70000


In [None]:
# remove a column
df1.drop('Salary', axis=1) # Temporarily removes the 'Salary' column

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [37]:
df1

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000
1,Bob,30,Los Angeles,60000
2,Charlie,35,Chicago,70000


In [38]:
# Permanently remove a column
df1.drop('Salary', axis=1, inplace=True) # Permanently removes the '

In [39]:
df1

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [40]:
# Increase age by 1 for all rows
df1['Age'] = df1['Age']+1
df1

Unnamed: 0,Name,Age,City
0,Alice,26,New York
1,Bob,31,Los Angeles
2,Charlie,36,Chicago


In [41]:
df1.drop(0,axis=0,inplace=True) # Permanently removes the first row

In [42]:
df1

Unnamed: 0,Name,Age,City
1,Bob,31,Los Angeles
2,Charlie,36,Chicago


In [43]:
print(df1.dtypes) # Display the data types of each column in the DataFrame

Name    object
Age      int64
City    object
dtype: object


In [44]:
df1.describe() # Generate descriptive statistics of the DataFrame

Unnamed: 0,Age
count,2.0
mean,33.5
std,3.535534
min,31.0
25%,32.25
50%,33.5
75%,34.75
max,36.0
