In [None]:
# Dealing with Multiple Files

import glob

import pandas as pd

# Collect every exam export in the working directory. glob returns matches in
# an arbitrary, filesystem-dependent order, so sort for a reproducible row order.
student_files = sorted(glob.glob('exams*.csv'))
if not student_files:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list; raise the same exception type with a message naming the real cause.
    raise ValueError("No files matching 'exams*.csv' were found in the working directory")

# Read each file into its own DataFrame, then stack them in a single concat.
df_list = [pd.read_csv(filename) for filename in student_files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
students = pd.concat(df_list, ignore_index=True)

In [None]:
# Reshaping your Data

# Turn the two exam columns into rows: each input row yields one output row per
# exam, with the exam's column name under 'exam' and its value under 'score'.
# To inspect the available columns first: print(students.columns)
students = students.melt(
    id_vars=['full_name', 'gender_age', 'grade'],
    value_vars=['fractions', 'probability'],
    var_name='exam',
    value_name='score',
)

In [None]:
# Dealing with Duplicates

# Boolean Series: True for every row that exactly repeats an earlier row.
duplicates = students.duplicated()
#print(duplicates.value_counts())

# To compare only specific columns, pass them via `subset`. This is left as a
# commented example so it neither overwrites the full-row mask above nor
# depends on a placeholder frame/column name that does not exist in this data:
# duplicates = students.duplicated(subset=['full_name'])

# Keep the first occurrence of each fully duplicated row and drop the rest.
students = students.drop_duplicates()

In [None]:
# Splitting by Index

# Example: a "birthday" column stores dates as MMDDYYYY strings, so "11011993"
# means November 1, 1993. Slicing each string by position yields month, day and
# year as separate feature columns.
# NOTE(review): the .str accessor needs string values - confirm the column was
# not parsed as integers when it was loaded.
df['month'] = df['birthday'].str[:2]   # characters 0-1 -> MM
df['day'] = df['birthday'].str[2:4]    # characters 2-3 -> DD
df['year'] = df['birthday'].str[4:]    # characters 4 onward -> YYYY

In [None]:
# Splitting by Character

# Break each full name on single spaces; every row becomes a list of pieces.
name_split = students['full_name'].str.split(' ')

# The first piece becomes first_name and the second piece becomes last_name.
students['first_name'] = name_split.str[0]
students['last_name'] = name_split.str[1]

In [None]:
# Looking at Types

# Print the dtype of every column in `df` (one line per column).
print(df.dtypes)

In [None]:
# String Parsing

# First, use a regex to strip every percent sign and comma from the scores.
# The pattern is a raw string with no backslash: '\%' is not a valid Python
# string escape and triggers an invalid-escape warning on current interpreters.
students['score'] = students['score'].replace(r'[%,]', '', regex=True)
# Then use pd.to_numeric() to convert the cleaned strings to integers or floats.
students['score'] = pd.to_numeric(students['score'])

# More String Parsing

# Splitting on a *captured* run of digits keeps the digits as their own piece:
# .str.split(r'(\d+)', expand=True) returns a frame whose column 0 holds the
# text before the first number, column 1 the number itself, and column 2 the
# text after it. Patterns are raw strings so '\d' and '\-' reach the regex
# engine unchanged instead of being read as (invalid) Python string escapes.

# Applied to the students data: keep only the number from `grade`, then convert
# it from string to a numeric dtype.
students['grade'] = students['grade'].str.split(r'(\d+)', expand=True)[1]
students['grade'] = pd.to_numeric(students['grade'])

# -----
# Generic form of the same technique, for a frame `df` that has an
# 'exerciseDescription' column:
split_df = df['exerciseDescription'].str.split(r'(\d+)', expand=True)

# Then assign columns from split_df back to the original df. Bracket assignment
# is required: `df.reps = ...` only updates a column that already exists;
# otherwise pandas sets a plain attribute and no column is added.
df['reps'] = pd.to_numeric(split_df[1])
# Remove hyphens and spaces from the text that follows the number.
df['exercise'] = split_df[2].replace(r'[\- ]', '', regex=True)

In [None]:
# Missing Values

# The two methods below are alternatives. Run in sequence as written, Method 1
# already removes every row with a missing value, so pick one in real use.

# Method 1: drop rows with missing values using .dropna()
df = df.dropna()                     # drop rows missing a value in any column
df = df.dropna(subset=['COLUMN'])    # drop rows missing a value in 'COLUMN' only

# Method 2: fill the missing values with a constant or an aggregate value such
# as the column mean, using .fillna()
# Fill a single column and assign the result back to that column. (Assigning it
# to `df` itself would replace the whole DataFrame with one Series.)
df['column'] = df['column'].fillna(0)
# Fill several columns at once: each key names the column to fill and each
# value is what goes into its gaps - here, that column's own mean. Keys must
# match the column names exactly; keys that match no column are ignored.
df = df.fillna(value={"column1": df.column1.mean(), "column2": df.column2.mean()})