In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reading a CSV: read_csv() loads the file into a DataFrame.
# The path is relative, so it is resolved against the current working directory
df = pd.read_csv('salaries.csv')
print(df)

     Name  Salary  Age
0    John   50000   34
1   Sally  120000   45
2  Alyssa   80000   27


In [3]:
# Selecting a single column: indexing the df with one column name gives back a Series.
# Notice in the output that the index positions are numbered for us
print('', 'Selecting columns', df['Salary'], sep='\n')



Selecting columns
0     50000
1    120000
2     80000
Name: Salary, dtype: int64


In [4]:
# Grabbing multiple columns of data: the data frame must be indexed with a *list* of
# column names. Written inline this is df[['Name', 'Salary']] -- the outer square brackets
# do the indexing and the inner ones build the list, hence the double square brackets
print('')
print('Grabbing multiple columns')
wanted_columns = ['Name', 'Salary']
print(df[wanted_columns])


Grabbing multiple columns
     Name  Salary
0    John   50000
1   Sally  120000
2  Alyssa   80000


In [5]:
# Just like with NumPy arrays, a column exposes min(), max() and mean()
print('', 'Grabbing min, max and mean values from a column', sep='\n')
salary = df['Salary']  # pick the column once and reuse it for the three statistics
print('Min Salary: ', salary.min())
print('Max Salary: ', salary.max())
print('Mean Salary: ', salary.mean())


Grabbing min, max and mean values from a column
Min Salary:  50000
Max Salary:  120000
Mean Salary:  83333.33333333333


In [6]:
# Conditional filtering: comparing a column against a value is evaluated row by row
# and gives back a Series of booleans (True where Age is above 30)
ser_of_bool = df['Age'] > 30
print('', 'Conditional filtering', ser_of_bool, sep='\n')


Conditional filtering
0     True
1     True
2    False
Name: Age, dtype: bool


In [8]:
# Passing the Series of booleans to the data frame filters it: only the rows where
# the Series is True are kept
print(df[ser_of_bool])


    Name  Salary  Age
0   John   50000   34
1  Sally  120000   45


In [9]:
# Exact same thing written as a single expression, which is the most common use:
print(df[df['Age'] > 30])
# Inside our data frame we are asking which rows have an Age > 30 and printing only those rows
# (every column is kept: the condition filters rows, not columns)


    Name  Salary  Age
0   John   50000   34
1  Sally  120000   45


In [10]:
# Useful methods:

In [11]:
# Grabbing unique values: unique() returns an array with each distinct value of the column
print('Grabing unique values: ')
print(df['Age'].unique())

Grabing unique values: 
[34 45 27]


In [12]:
# And we can use nunique() to get the number of distinct values, i.e. the length of that array
# (nunique() skips missing values by default, so the two can differ when the column has NaN)
print(df['Age'].nunique())

3


In [13]:
# Listing the columns
print(df.columns)  # notice this is an attribute of the df and not a method, so it doesn't require '()'


Index(['Name', 'Salary', 'Age'], dtype='object')


In [14]:
# info() reports back information of our df: index range, columns, non-null counts,
# dtypes and memory usage. It writes that report straight to stdout and returns None,
# so we call it on its own -- wrapping it in print() would add a stray 'None' line
# after the report
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Salary  3 non-null      int64 
 2   Age     3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes
None


In [15]:
# describe() will give us a statistical summary (count, mean, std, min, quartiles, max)
# of the numeric columns; the text column 'Name' is left out of the result
print(df.describe())

              Salary        Age
count       3.000000   3.000000
mean    83333.333333  35.333333
std     35118.845843   9.073772
min     50000.000000  27.000000
25%     65000.000000  30.500000
50%     80000.000000  34.000000
75%    100000.000000  39.500000
max    120000.000000  45.000000


In [16]:
# index -> attribute holding the row labels of the df. We never set an index ourselves,
# so what we get back is the automatic RangeIndex pandas assigned to the rows
print(df.index)

RangeIndex(start=0, stop=3, step=1)


In [17]:
# MIXING numpy and Pandas

In [18]:
# Build a 5x10 NumPy array holding the integers 0..49, filled row by row
mat = np.arange(50).reshape((5, 10))
print(mat)  # this is a 2-D NumPy array (numpy.ndarray)

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]
 [30 31 32 33 34 35 36 37 38 39]
 [40 41 42 43 44 45 46 47 48 49]]


In [19]:
# We can convert it to a pandas df. Notice in the output that pandas automatically
# labels the column names and the index (rows) with integers starting at 0
new_df = pd.DataFrame(mat)
print(new_df)


    0   1   2   3   4   5   6   7   8   9
0   0   1   2   3   4   5   6   7   8   9
1  10  11  12  13  14  15  16  17  18  19
2  20  21  22  23  24  25  26  27  28  29
3  30  31  32  33  34  35  36  37  38  39
4  40  41  42  43  44  45  46  47  48  49


In [20]:
# Second attempt: this time we supply our own column names and row labels (index)
# instead of relying on the automatic integer ones
new_mat_2 = np.arange(10).reshape((5, 2))  # integers 0..9 laid out as 5 rows x 2 columns
new_df_2 = pd.DataFrame(
    new_mat_2,
    index=['Peter', 'Maria', 'Nick', 'Tomas', 'Jessie'],  # one label per row
    columns=['A', 'B'],  # one name per column
)
print(new_df_2)

        A  B
Peter   0  1
Maria   2  3
Nick    4  5
Tomas   6  7
Jessie  8  9
