In [1]:
import pandas as pd

In [2]:
#https://github.com/colabLearn/PandasDFrameVideoScripts/blob/main/testData/Student_performance_data.csv
gitRepo = "https://github.com/colabLearn/PandasDFrameVideoScripts/raw/main/testData/Student_performance_data.csv"

In [3]:
studentData = pd.read_csv(gitRepo, dtype_backend="pyarrow", engine="pyarrow")

In [4]:
studentData

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.27915,4.0
2389,3390,16,1,0,2,6.8055,20,0,2,0,0,0,1,1.142333,2.0
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0


In [5]:
#One thing we may want to look at regarding the ]
#overall performance of the students is the mean GPA
# Let's extract the studentGPA from the dataframe
#into a Series
studentGPA = studentData['GPA']

### Single aggregation methods

In [6]:
#we can compute the mean GPA
#using this aggregation method of the series
studentGPA.mean()

1.9061863027265407

However, mean can be sensitive to outliers
for example if you have this kind of data
[22,24,26,31,99]

In [7]:
testData = pd.Series([22,24,26,31,99])

In [8]:
testData.mean()

40.4

In [9]:
#This value does not truly 
#represent typical value of
#data in this data set

In [10]:
#In this case median could be 
#a better measure to indicate
#typical values in the data
testData.median()

26.0

In [11]:
#skew()
studentGPA.astype('double').skew()

0.014525601346976186

- 0: Perfectly symmetric distribution.
- Positive value: Right-skewed (long tail on the right side).
- Negative value: Left-skewed (long tail on the left side).
In practical terms:
- Skewness between -0.5 and 0.5: Indicates a fairly symmetric distribution.
- Skewness between -1 and -0.5 or 0.5 and 1: Indicates moderate skewness.
- Skewness less than -1 or greater than 1: Indicates significant skewness.

In [12]:
print(studentGPA.mean(), studentGPA.median())

1.9061863027265407 1.8933926937866847


In [13]:
#Mean and median will be very close if data is symmetrical

In [14]:
#There are many other aggregation methods like this that we call on a series object
#Such as:
studentGPA.min() #Minmum value in the data

0.0

In [15]:
studentGPA.max()

4.0

In [16]:
studentGPA.var()

0.8375101754746491

In [17]:
studentGPA.std()

0.9151558203249592

#### Mention the .agg()

Another useful method if the .agg method
- obviously from its name it aggregates data, like quantile,
- it transform data depending on the argument you use to call the method
- you can call .agg method to computer the mean of data like this:

In [18]:
studentGPA.agg('mean')

1.9061863027265407

but of course the studentGPA.mean() method is suffice 
to compute mean. Where .agg method provide a better use
is in this example:

##### Key Advantages of Using .agg():
- Combine Multiple Aggregation Functions: .agg() allows you to apply multiple aggregation functions (e.g., mean, sum, min, max) simultaneously on a Series or DataFrame, enabling comprehensive summary statistics in a single step.
- Custom Aggregation Functions: You can pass user-defined functions to .agg(), allowing for customized aggregation logic beyond the built-in methods.

In [19]:
studentGPA.agg(['mean', 'var', 'max', 'min'])

mean    1.906186
var     0.837510
max     4.000000
min     0.000000
Name: GPA, dtype: float64

In [20]:
#Let's assume we make the pass GPA =1.5
#So that all students with GPT>=1.5 pass, otherwise fail
studentGPA.ge(1.5)

0        True
1        True
2       False
3        True
4       False
        ...  
2387     True
2388     True
2389    False
2390     True
2391     True
Name: GPA, Length: 2392, dtype: bool[pyarrow]

In [21]:
studentGPA.ge(1.5).sum()

1544

In [22]:
def no_student_score_gt_mean_gpa(s):
    return s.gt(1.5).sum()

In [23]:
studentGPA.agg(['mean', 'var', 'max', 'min', no_student_score_gt_mean_gpa])

mean                               1.906186
var                                0.837510
max                                4.000000
min                                0.000000
no_student_score_gt_mean_gpa    1544.000000
Name: GPA, dtype: float64

In [24]:
#An approach to get the % of these student that pass
(studentGPA.ge(1.5).sum()/studentGPA.count())*100

64.54849498327759

In [25]:
studentGPA.describe()

count      2392.0
mean     1.906186
std      0.915156
min           0.0
25%      1.174803
50%      1.893393
75%      2.622216
max           4.0
Name: GPA, dtype: double[pyarrow]

In [26]:
studentGPA.quantile()

1.8933926937866847

In [27]:
studentGPA.quantile(0.25)

1.1748034468836228

In [28]:
studentGPA.quantile(0.75)

2.622216170450785

In [29]:
studentGPA.quantile([0.25, 0.5, 0.75])

0.25    1.174803
0.50    1.893393
0.75    2.622216
Name: GPA, dtype: double[pyarrow]

In [30]:
studentGPA.is_unique

False

In [31]:
studentID = studentData['StudentID']

In [32]:
studentID.is_unique

True

In [33]:
#testSeries = pd.Series([1, 2, 3, 3, 4, 6, 7, 7, 9])  #-->This first
testSeries = pd.Series([1, 2, 3, 3, 5, 4, 6, 7, 7, 9])
#testSeries = pd.Series([ ])

In [34]:
#We may want to write a function like this;
def is_increasing(in_series):
    for i in range(len(in_series)-1):
        if(in_series[i]>in_series[i+1]):
            return False
    return True   

In [35]:
len(testSeries)

10

In [36]:
#If we call the quatile method without any arugment 
#Like this --> the output is the 50% quantile
#That is, the median of our dataset
studentGPA.quantile()

1.8933926937866847

This output indicates that:
- 50% of the students score below 1.89GPA
- And, 50% score above it

In [37]:
#25% quantile
studentGPA.quantile(0.25)

1.1748034468836228

In [38]:
studentGPA.quantile([.25,.5, .75])

0.25    1.174803
0.50    1.893393
0.75    2.622216
Name: GPA, dtype: double[pyarrow]

This output indicate that:
- 25% of the students score below: 1.17
- 50% score below 1.89
- 75% score below 2.62 GPA

- 90% of the students score below:3.13 GPA

In [39]:
#You can as well specify other levels of quantiles such as:
studentGPA.quantile(0.9)

3.132489956156879

In [40]:
#bool[pyarrow] datatype will not support this
studentGPA.ge(1.5).astype('bool').mul(100).mean()

64.54849498327759

In [41]:
percentile_25_mask = studentGPA.lt(studentGPA.quantile(0.25))

In [42]:
percentile_75_mask = studentGPA.gt(studentGPA.quantile(0.75))

In [43]:
studyTime_25 = studentData['StudyTimeWeekly'][percentile_25_mask]
studyTime_75 = studentData['StudyTimeWeekly'][percentile_75_mask]

In [44]:
studyTime_25

2         4.21057
18      16.254658
21      15.323142
22       18.64888
32       7.663289
          ...    
2370     2.912575
2373    19.088954
2375     18.92529
2378    12.905555
2389       6.8055
Name: StudyTimeWeekly, Length: 598, dtype: double[pyarrow]

In [45]:
studyTime_75

0       19.833723
1       15.408756
5        8.191219
6        15.60168
8        4.562008
          ...    
2376     14.66723
2377    12.749976
2381    10.095086
2387    10.680555
2388     7.583217
Name: StudyTimeWeekly, Length: 598, dtype: double[pyarrow]

This kind of looks neater 

In [46]:
from scipy import stats
t_stat, p_value = stats.ttest_rel(studyTime_25,studyTime_75)
print("T-Statistic: ", t_stat)
print("P-Value: ", p_value)

#Interprt the result
alpha  = 0.05
if p_value < alpha:
    print("The difference is statistically significant.")
else:
    print("The difference is not statistically significant.")

T-Statistic:  -7.901126749992717
P-Value:  1.3362897863540234e-14
The difference is statistically significant.


In [47]:
print(is_increasing(testSeries))

False


In [48]:
testSeries.is_monotonic_increasing

False

In [49]:
def significant_diff(series1, series2):
    t_stat, p_value = stats.ttest_rel(series1, series2)
    result = {"T-Statistic":t_stat, "P-Value":p_value}
    
    #Interprt the result
    alpha  = 0.05
    if p_value < alpha:
        result['interpret']="The difference is statistically significant."
    else:
        result['interpret'] = "The difference is not statistically significant."  

    return pd.Series(result)

In [50]:
significant_diff(studyTime_25, studyTime_75)

T-Statistic                                       -7.901127
P-Value                                                 0.0
interpret      The difference is statistically significant.
dtype: object

In [51]:
#We have 49.62% of the student that study more thatn 9.77hours per week

In [52]:
studentGPA.describe()

count      2392.0
mean     1.906186
std      0.915156
min           0.0
25%      1.174803
50%      1.893393
75%      2.622216
max           4.0
Name: GPA, dtype: double[pyarrow]

Common Aggregation Functions:
- 'mean': Mean of the values.
- 'sum': Sum of the values.
- 'min': Minimum value.
- 'max': Maximum value.
- 'median': Median value.
- 'var': Variance.
- 'std': Standard deviation.
- 'count': Count of non-null values.
- 'prod': Product of the values.
- 'first': First value.
- 'last': Last value.

Statistical Aggregations:
- series.mean(): Returns the average (mean) of the values in the Series.
- series.sum(): Computes the sum of all values in the Series.
- series.min(): Returns the smallest value in the Series.
- series.max(): Returns the largest value in the Series.
- series.median(): Calculates the median (middle value) of the Series.
- series.var():  Computes the variance of the values, which measures the spread of the values around the mean.
- series.std(): Calculates the standard deviation, which is the square root of the variance.
- series.prod(): Returns the product of all values in the Series.
- series.cumsum(): Computes the cumulative sum of the values in the Series.
- series.cumprod(): Computes the cumulative product of the values in the Series.
Count and Frequency:
- series.count(): Counts the number of non-NA/null entries in the Series.
- series.value_counts(): Description: Returns a Series with counts of unique values.
- series.nunique(): Description: Returns the number of unique values in the Series.
Aggregation with agg():
- series.agg(): Allows applying multiple aggregation functions to the Series at once. The output depends on the functions used (e.g., ['mean', 'sum']).
Custom Aggregations:
- Custom functions passed to agg():  You can pass custom functions to perform specific aggregations. The output varies based on the custom function provided. For example, lambda x: x.sum() / len(x) calculates the mean.


### Properties for Series
Descriptive Statistics:
- series. describe(): Provides a summary of statistics including count, mean, standard deviation, min, 25th percentile, median (50th percentile), 75th percentile, and max.
- series.is_unique: Checks if all values in the Series are unique. Output is a boolean (True or False).
- series.is_monotonic:Checks if the Series is either monotonically increasing or decreasing. Output is a boolean (True or False).
- series.is_monotonic_increasing: Checks if the Series is monotonically increasing. Output is  boolean (True or False).
- series.is_monotonic_decreasing: Checks if the Series is monotonically decreasing. Output is boolean (True or False).

Duplicates and Unique Values:
- series.duplicated(): Returns a boolean Series indicating whether each value is a duplicate of an earlier value.
- series.unique(): Returns an array of unique values.
- series.drop_duplicates(): Returns a Series with duplicate values removed. The output maintains the original order of the first occurrence.
Additional Properties:
- series.first(): (For GroupBy objects, not directly for Series) Returns the first value in each group.
