In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../data/student_depression_dataset.csv", sep=",")

In [3]:
df.head(3)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0


In [4]:
df["Age_double"] = df["Age"] * 2

In [5]:
df.head(3)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_double
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1,66.0
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0,48.0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0,62.0


#### NaN = Not a Number (inconsistência matemática, exemplo: 0 dividido por 0)

In [6]:
df["Escalar"] = 1

In [7]:
df.head(2)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_double,Escalar
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1,66.0,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0,48.0,1


### Usando Numpy para cálculos matemáticos

In [10]:
df["Age_log"] = np.log(df["Age"]) # Aplica o log na coluna inteira

In [11]:
df["Age_log"].head(3)

0    3.496508
1    3.178054
2    3.433987
Name: Age_log, dtype: float64

#### Como deixar a coluna Gender em maiúsculo

In [15]:
#Forma sequencial:
Gender_m = []
for i in df["Gender"]:
    Gender_m.append(i.upper())

df["Gender_m"] = Gender_m

In [16]:
df["Gender_m"].head(3)

0      MALE
1    FEMALE
2      MALE
Name: Gender_m, dtype: object

In [None]:
#Forma vetorial (melhor) - tudo de uma vez só:
df["Gender_m"] = df["Gender"].str.upper() #Supõe que cada elemento é uma string e aplica o método upper. Se a variável não for string, vai dar um erro.


In [18]:
df["Gender_m"].head(3)

0      MALE
1    FEMALE
2      MALE
Name: Gender_m, dtype: object

### Usando função lambda

In [None]:
# Usado quando você quer definir uma função para usar uma única vez em seguida
df["Profession"].apply(lambda x: x.split("_")[0])

0        Student
1        Student
2        Student
3        Student
4        Student
          ...   
27896    Student
27897    Student
27898    Student
27899    Student
27900    Student
Name: Profession, Length: 27901, dtype: object

### Definindo e aplicando funções

In [20]:
def intervalo_idades(idade):
    if idade < 25:
        return "Jovem"
    elif idade >= 25 or idade < 60:
        return "Adulto"
    else:
        return "Idoso"

In [21]:
df["Age_interval"] = df["Age"].apply(intervalo_idades)

In [22]:
df2 = df[df["Age"] < 30]

In [23]:
df2.head(3)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_interval
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0,Jovem
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1,Adulto
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0,Adulto


### Usando o apply agora em um dataframe

Combinando colunas

In [31]:
df.columns

Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression',
       'Age_interval'],
      dtype='object')

In [28]:
def soma_pontos(df_t):
    pontos = 0
    if df_t["Age"] < 20:
        pontos = pontos + 1
    elif df_t["Age"] >= 20 or df_t["Age"] < 30:
        pontos = pontos + 2
    elif df_t["Age"] >= 30:
        pontos = pontos + 3

    if df_t["CGPA"] < 5:
        pontos = pontos + 1
    elif df_t["CGPA"] >=5 or df_t["CGPA"] < 6:
        pontos = pontos + 2
    elif df_t["CGPA"] >= 6:
        pontos = pontos + 3
    
    return pontos


In [32]:
df["pontos"] = df.apply(soma_pontos, axis=1)

In [33]:
df.head(3)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_interval,pontos
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1,Adulto,4
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0,Jovem,4
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0,Adulto,4
