# Pandas Exploration

## Imports

In [3]:
import pandas as pd

import numpy as np                # for manipulation of examples
#import cprint

## DataFrame Setup

In [None]:
# Build a 3x3 frame holding the integers 1..9 row by row, with columns 'a', 'b', 'c'.
# TODO: also write down the dictionary-based way of constructing this frame here.
df = pd.DataFrame(np.arange(1, 10).reshape(3, 3), columns=list('abc'))
display(df)

## Mastery of Indexing
ALWAYS "RC" - Rows, Columns

Axis 0 = Rows

Axis 1 = Columns

The main thing here is that pandas and numpy indexing are very similar; really, the only difference is that pandas requires an extra accessor (".iloc" for positions, ".loc" for labels), whereas numpy does not.

In [None]:
# Common indexing command: keep every row and only the first 10 columns.
# The stop value of the column slice is exactly the number of columns kept
# (fewer if the frame has fewer than 10 columns).
df = df.iloc[:, :10]

# Boolean mask: a dataframe of True/False with the same shape as the selection,
# True where the value lies in the range [-0.2, 0.2].
# It is stored under its own name so that `df` keeps its values for the
# examples below (overwriting `df` here would make them threshold True/False).
in_range = (df.iloc[:, :10] >= -.2) & (df.iloc[:, :10] <= .2)

# These two syntaxes both return the original values where the mask is True
# and NaN everywhere else: indexing with a boolean dataframe, df[mask], is
# implemented through df.where(mask).
# Neither one removes rows by itself; chain .dropna() to discard rows that
# contain a NaN.
masked_by_getitem = df[in_range]
masked_by_where = df.where(in_range)


## Mastery of "filtering" by values
When it comes to filtering, although it feels inefficient, it appears that this must be done by popping out each column that needs filtering, applying a "filter" that keeps only the values within a certain range, and then rejoining those columns to the original dataframe. I have searched far and wide for a more efficient way to do this, and even wrote an SO question about it, but at the moment this appears to be the only way.

In [None]:
# filtering values in a target variable, and all feature columns
def return_viable_samples(df, *, label_range=(25, 350), label_threshold=140,
                          feature_range=(-.2, .2), n_features=10):
    """Return the rows of ``df`` whose label and features are all in range.

    A row is kept only when its ``'labels'`` value lies inside ``label_range``
    AND every one of the first ``n_features`` feature columns lies inside
    ``feature_range`` (both ranges inclusive). NaN values fail every comparison,
    so rows with a missing label or a missing checked feature are dropped.

    The input frame is not modified.

    Args:
        df: DataFrame with a ``'labels'`` column plus numeric feature columns.
        label_range: inclusive ``(low, high)`` bounds for a usable label.
        label_threshold: labels ``>=`` this value become 1, all others 0.
        feature_range: inclusive ``(low, high)`` bounds for a usable feature.
        n_features: number of leading feature columns (after removing
            ``'labels'``) that are range-checked.

    Returns:
        A new DataFrame holding the surviving rows: all feature columns
        unchanged, followed by an integer 0/1 ``'labels'`` column.

    Raises:
        KeyError: if ``df`` has no ``'labels'`` column.
    """
    labels = df['labels']
    # drop() returns a new frame, so the caller's dataframe keeps its labels.
    features = df.drop(columns='labels')

    label_low, label_high = label_range
    label_ok = (labels >= label_low) & (labels <= label_high)

    feature_low, feature_high = feature_range
    checked = features.iloc[:, :n_features]
    # One boolean per row: True only if every checked feature is in range.
    features_ok = ((checked >= feature_low) & (checked <= feature_high)).all(axis=1)

    # Combine both conditions BEFORE selecting, so a row with a rejected label
    # is removed instead of coming back with a NaN label.
    keep = label_ok & features_ok

    viable = features.loc[keep].copy()
    viable['labels'] = (labels.loc[keep] >= label_threshold).astype(int)
    return viable

## Simple dataframe things

In [4]:
display(df)

# round(decimals=-1) rounds every value to the nearest ten
print(f'rounding all values in a dataframe: \n{df.round(decimals=-1)}\n')

# distinct values found in column "a"
print(f'unique values in "a": {df["a"].unique()}\n')

print(f'number of unique values in a: {len(df["a"].unique())} \n')

# disabled (needs cprint): number of unique values in every column, df.nunique(axis=0)

# how many times each distinct value occurs in column "a"
print(f'count of each unique value column "a": \n{df.loc[:, "a"].value_counts()} \n')

# same counts expressed as a fraction of all rows
print(f'percentage of each unique value in colunn "a": \n{df.loc[:, "a"].value_counts(normalize=True)} \n')

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


rounding all values in a dataframe: 
    a   b   c
0   0   0   0
1   0   0  10
2  10  10  10

unique values in "a": [1 4 7]

number of unique values in a: 3 

count of each unique value column "a": 
7    1
1    1
4    1
Name: a, dtype: int64 

percentage of each unique value in colunn "a": 
7    0.333333
1    0.333333
4    0.333333
Name: a, dtype: float64 



### Concatenating dataframes ---- this function can do all the types of "joins/appends" needed

In [None]:
# With no axis given, the frames are stacked row-wise: an "append".
pd.concat((df_1, df_2))

# Along the column axis the frames are placed side by side: a "join",
# which is an outer join unless `join` is set otherwise.
pd.concat((df_1, df_2), axis="columns")