## Numpy
A popular python library for performing mathematical operations on datatypes like arrays and vectors.  

In [11]:
import numpy as np #importing numpy with np alias

In [4]:
a = np.arange(36).reshape((6,6)) # create a matrix with 36 elements using arange and reshape according to factors 
a

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

#### Slicing Operations

In [10]:
a[2, 2:5]

array([14, 15, 16])

In [8]:
a[:3,3]

array([ 3,  9, 15])

## Pandas

In [None]:
import pandas as pd 

In [21]:
p = pd.Series([2,4])
print(p, "\nDimensions: ", p.ndim)

0    2
1    4
dtype: int64 
Dimensions:  1


In [19]:
p = pd.Series([2,4,6,8,10.5], index=["a","b","c","d","e"])
p

a     2.0
b     4.0
c     6.0
d     8.0
e    10.5
dtype: float64

In [20]:
a.ndim

2

### Conversion from List, Dicts and Numpy Arrs to Dataframes

In [32]:
alist = [2,4,6,8,10.5]
adict = {"a": 12, "b": 23,"c": 23, "d": 57}
bdict = {"a": [12], "b": [23],"c": [23], "d": [57]}
anump = np.arange(10) 

In [24]:
# List to DF
df = pd.DataFrame(alist)
df

Unnamed: 0,0
0,2.0
1,4.0
2,6.0
3,8.0
4,10.5


In [34]:
# Dictionary to DF
df = pd.DataFrame(adict,  index=[0])
df2 = pd.DataFrame(bdict)
df

Unnamed: 0,a,b,c,d
0,12,23,23,57


In [35]:
df2

Unnamed: 0,a,b,c,d
0,12,23,23,57


In [27]:
# Numpy Array to DF
df = pd.DataFrame(anump)
df

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


#### Example

In [61]:
college = {"student": ['Bubu', 'Sara', 'Coma', 'Disha'], "event": ['Sports', 'Badminton', 'Hockey', 'Bowling'],"score": [1,3,4,6]}
df2 = pd.DataFrame(college)
df2

Unnamed: 0,student,event,score
0,Bubu,Sports,1
1,Sara,Badminton,3
2,Coma,Hockey,4
3,Disha,Bowling,6


In [62]:
df2.columns

Index(['student', 'event', 'score'], dtype='object')

In [63]:
print("Coma's score: ",df2['score'][2])

Coma's score:  4


In [64]:
df2[df2['score'] > 5]

Unnamed: 0,student,event,score
3,Disha,Bowling,6


In [65]:
df2['Status'] = ["Fail",  "Fail", "Pass", "Pass"]
df2

Unnamed: 0,student,event,score,Status
0,Bubu,Sports,1,Fail
1,Sara,Badminton,3,Fail
2,Coma,Hockey,4,Pass
3,Disha,Bowling,6,Pass


In [66]:
df2['student'][2] = "Chroma" # changed a value
df2 # the warning can be ignored

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['student'][2] = "Chroma"


Unnamed: 0,student,event,score,Status
0,Bubu,Sports,1,Fail
1,Sara,Badminton,3,Fail
2,Chroma,Hockey,4,Pass
3,Disha,Bowling,6,Pass


In [67]:
# we can use del keyword with and without parenthesis 
# del df2['event']
del(df2['event']) # deleted a column
df2

Unnamed: 0,student,score,Status
0,Bubu,1,Fail
1,Sara,3,Fail
2,Chroma,4,Pass
3,Disha,6,Pass


In [79]:
df2.loc[3][:2] # accessing rows using loc method

student    Disha
score          6
Name: 3, dtype: object

### Reading dataset from filesystem
We use the read_csv() method for reading csv from localstorage and storing them as an object in python. Other file formats can also be read such as read_excel, read_json, read_sql.

In [5]:
dataset = pd.read_csv('../mobile_price_data.csv') 
dataset.head() # prints first 5 or specified number of rows from the dataset 

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1


In [None]:
dataset.describe() # summary of numerical values in the dataset

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,1248.51,0.516,1.5409,0.517,4.593,0.487,33.652,0.5175,139.511,...,10.054,627.121,1239.774,2138.998,11.995,5.316,11.085,0.756,0.5,0.507
std,288.819436,432.458227,0.499994,0.829268,0.499961,4.463325,0.500081,18.128694,0.280861,34.85155,...,6.095099,432.929699,439.670981,1088.092278,4.320607,4.240062,5.497636,0.429708,0.50025,0.500201
min,1.0,500.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,...,0.0,0.0,501.0,263.0,5.0,0.0,2.0,0.0,0.0,0.0
25%,250.75,895.0,0.0,0.7,0.0,1.0,0.0,18.0,0.3,109.75,...,5.0,263.75,831.75,1237.25,8.0,2.0,6.75,1.0,0.0,0.0
50%,500.5,1246.5,1.0,1.5,1.0,3.0,0.0,34.5,0.5,139.0,...,10.0,564.5,1250.0,2153.5,12.0,5.0,11.0,1.0,0.5,1.0
75%,750.25,1629.25,1.0,2.3,1.0,7.0,1.0,49.0,0.8,170.0,...,16.0,903.0,1637.75,3065.5,16.0,8.0,16.0,1.0,1.0,1.0
max,1000.0,1999.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,...,20.0,1907.0,1998.0,3989.0,19.0,18.0,20.0,1.0,1.0,1.0


In [6]:
dataset.columns

Index(['id', 'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc',
       'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
       'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')

In [7]:
dataset.size

21000

In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   int64  
 1   battery_power  1000 non-null   int64  
 2   blue           1000 non-null   int64  
 3   clock_speed    1000 non-null   float64
 4   dual_sim       1000 non-null   int64  
 5   fc             1000 non-null   int64  
 6   four_g         1000 non-null   int64  
 7   int_memory     1000 non-null   int64  
 8   m_dep          1000 non-null   float64
 9   mobile_wt      1000 non-null   int64  
 10  n_cores        1000 non-null   int64  
 11  pc             1000 non-null   int64  
 12  px_height      1000 non-null   int64  
 13  px_width       1000 non-null   int64  
 14  ram            1000 non-null   int64  
 15  sc_h           1000 non-null   int64  
 16  sc_w           1000 non-null   int64  
 17  talk_time      1000 non-null   int64  
 18  three_g  

In [19]:
dataset['id'][0] = 69
dataset.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['id'][0] = 69


Unnamed: 0,id,battery_power,blue,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,69,1043,1,1,14,0,5,0.1,193,3,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,1,4,1,61,0.8,191,5,12,746,857,3895,6,0,7,1,0,0


In [16]:
del(dataset['clock_speed'])

In [17]:
dataset.columns

Index(['id', 'battery_power', 'blue', 'dual_sim', 'fc', 'four_g', 'int_memory',
       'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram',
       'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi'],
      dtype='object')

In [21]:
dataset['touch_screen'].value_counts()

1    500
0    500
Name: touch_screen, dtype: int64

In [33]:
# iloc method 
# data_object.iloc[rows, columns]
dataset.iloc[:1,2:]

Unnamed: 0,blue,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1,14,0,5,0.1,193,3,16,226,1412,3476,12,7,2,0,1,0


In [34]:
dataset.loc[:1,]

Unnamed: 0,id,battery_power,blue,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,69,1043,1,1,14,0,5,0.1,193,3,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,1,4,1,61,0.8,191,5,12,746,857,3895,6,0,7,1,0,0
