In [42]:
!pip install pandas==2.2.3
!pip install numpy==2.1.2
!pip install requests==2.32.3
!pip install python-dateutil==2.9.0.post0
!pip install scikit-learn==1.5.2
!pip install scipy==1.14.1
!pip install tqdm==4.66.5
!pip install urllib3==2.2.3
!pip install joblib==1.4.2
!pip install jupyterlab==4.2.5
!pip install matplotlib==3.9.2
!pip install seaborn==0.13.2
!pip install json5==0.9.25
!pip install statsmodels==0.14.4
!pip install tqdm==4.66.5
!pip install types-python-dateutil==2.9.0.20241003
!pip install urllib3==2.2.3



In [43]:
import requests
import joblib
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from pandas import option_context
from tqdm import tqdm
from statsmodels.stats.inter_rater import aggregate_raters
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mannwhitneyu,shapiro,ttest_ind

import cliffs_delta as cd
import GenerateActivities as gat
import important_features as imf

In [44]:
QUERY_ROOT = "https://api.github.com"
TOKEN = '' # write your GitHub API key here
HEADERS={'Authorization':'token '+TOKEN}

## Section 3

### Question 1

#### Write an automated Python script to identify all GitHub bot actors in the list of contributors, by querying the GitHub REST API users endpoint and extracting the required information to make this decision.

#### More information: You can search for the "type" key in the obtianed JSON result, If it is "Bot", then it is a bot actor, if it is "User", then it is an account.

In [45]:
df = pd.read_csv('sample10.csv')
if len(df.columns) != 2:
    raise ValueError("There should be 2 columns")
df

Unnamed: 0,contributor,bothunter_type
0,AtreyeeS,Human
1,EwoutH,Human
2,GPUtester,Human
3,JuliaTagBot,Bot
4,MarcSkovMadsen,Human
5,Sauravroy34,Human
6,WarrenWeckesser,Human
7,ammar-qazi,Human
8,awvwgk,Human
9,bioc-workshop-dev,Human


In [46]:

contributors = []
for c in df["contributor"]:
    contributors.append(c)

print(contributors)

['AtreyeeS', 'EwoutH', 'GPUtester', 'JuliaTagBot', 'MarcSkovMadsen', 'Sauravroy34', 'WarrenWeckesser', 'ammar-qazi', 'awvwgk', 'bioc-workshop-dev', 'braingram', 'codecov-commenter', 'codecov[bot]', 'codspeed-hq[bot]', 'conda-bot', 'crvernon', 'droumis', 'flying-sheep', 'github-actions[bot]', 'github-advanced-security[bot]', 'helske', 'jaimergp', 'jonathanfischer97', 'katrinabrock', 'likeajumprope', 'martin-frbg', 'mira-miracoli', 'natmokval', 'netlify[bot]', 'ottointhesky', 'ppxasjsm', 'pre-commit-ci[bot]', 'renovate[bot]', 'saimn', 'speth', 'stan-buildbot', 'tardis-bot', 'tkoyama010', 'wpbonelli', 'zeptodoctor']


In [47]:
app_dict = []

for contributor in contributors:
    query = f'{QUERY_ROOT}/users/{contributor}'
    response = requests.get(query, headers=HEADERS)
    json_response = response.json()
    is_bot = json_response["type"] == "Bot"
    app_dict.append({'contributor': contributor, "app": is_bot})

pd_app_dict = pd.DataFrame.from_dict(app_dict)
pd_app_dict

Unnamed: 0,contributor,app
0,AtreyeeS,False
1,EwoutH,False
2,GPUtester,False
3,JuliaTagBot,False
4,MarcSkovMadsen,False
5,Sauravroy34,False
6,WarrenWeckesser,False
7,ammar-qazi,False
8,awvwgk,False
9,bioc-workshop-dev,False


### Question 2

#### Add a new 'actor' column in the CSV file to store this information. You can write True if it is a bot actor and False if it is not.

#### Report on the total number of GitHub bot actors, and User accounts present in the list of accounts of your dataset.

In [48]:
contributors_pd = pd.merge(df, pd_app_dict, on='contributor')

bots = contributors_pd.loc[contributors_pd["app"] == True]
print("Nb of bots: " + str(len(bots)))
bots

Nb of bots: 7


Unnamed: 0,contributor,bothunter_type,app
12,codecov[bot],Bot,True
13,codspeed-hq[bot],Bot,True
18,github-actions[bot],Bot,True
19,github-advanced-security[bot],Bot,True
28,netlify[bot],Bot,True
31,pre-commit-ci[bot],Bot,True
32,renovate[bot],Bot,True


### Question 3

#### For each bot actor, write their purpose, the task that they automate by looking at their GitHub profile, homepage, information on the GitHub Marketplace, or other documentation that you can find online.


 - Codecov : Codecov provides highly integrated tools to group, merge, archive and compare coverage reports. Whether your team is comparing changes in a pull request or reviewing a single commit, Codecov will improve the code review workflow and quality. [website](https://about.codecov.io/)
 - CodSpeed : CodSpeed is a continuous benchmarking platform that allows you to track and compare the performance of your codebase during development. [website](https://codspeed.io/)
 - github-actions : Automate, customize, and execute your software development workflows right in your repository with GitHub Actions.
 - github-advanced-security : GitHub makes extra security features available to customers under an Advanced Security license.
 - netlify : Publish incredibly high performance websites and applications right from GitHub. The Netlify platform connects your repositories to an all-in-one workflow for global CDN deployment, continuous integration, and automatic (and free) HTTPS. [website](https://www.netlify.com/)
 - pre-commit-ci : a continuous integration service for the pre-commit framework. [website](https://github.com/marketplace/pre-commit-ci)
 - renovate : Dependency Automation service by Mend.io. [website](https://github.com/marketplace/renovate)


## Section 4

### Question 1

#### Write an automated script to use BIMBAS (Bot Identification Model Based on Activity Sequences) to obtain a prediction of the GitHub contributor type (Bot or Human) for the contributors present in your dataset.

#### Folow the steps given in each cell to use BIMBAS and obtain predictions.

### Read all the events and select the events performed by the contributors present in the given set

In [53]:
# 1. Read the csv file of events (all_events.csv) provided along with this notebook
# 2. Convert created_at column to datetime format
#    One possible way is to use lambda function: "events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))"
# 3. Get the contributors provided to you in a list or use the contributors list that you created in Section 3 Question 1.
#    e.g., ['contributor1', 'contributor2', 'contributor3',...]
# 4. Select the events performed by these contributors alone. Do not consider all the events. You should consider only the events
#    performed by the contributors provided to you. Hint: you can use the df.query function
# 5. Display the considered events.

df_events = pd.read_csv('all_events.csv')
df_events['created_at'] = pd.to_datetime(df_events['created_at'], unit='ms')
df_events = df_events[df_events['login'].isin(contributors)]
df_events['event_type']

Unnamed: 0,event_type
3,IssueCommentEvent
6,CommitCommentEvent
15,CommitCommentEvent
16,PushEvent
21,CreateEvent
...,...
358397,IssueCommentEvent
358398,IssueCommentEvent
358401,PushEvent
358403,PushEvent


### Execute BIMBAS to obtain the predictions

In [54]:
# Do not modify this cell
# Use the following function the code below to obtain your predictions

def execute_bimbas(contributors_list, selected_events):
    '''
    args: contributors_list (list) - list of contributors in the dataset provided to you
          selected_events (DataFrame) - DataFrame of events performed by the considered contributors
    return: bimbas_prediction (DataFrame) - DataFrame of preditions along with the confidence in prediction for each contributor
    '''
    result=pd.DataFrame()
    temp_list = []
    bimbas = joblib.load('bimbas.joblib')
    date_limit = pd.to_datetime(selected_events.created_at.max()) + pd.DateOffset(-90)

    for contributor in tqdm(contributors_list):
        # c = contributors[contributor]
        activities = gat.activity_identification(selected_events.query('login==@contributor and created_at>=@date_limit'))
        activity_features = (
                            imf.extract_features(activities)
                            .set_index([[contributor]])
                            )
        # features = pd.concat([features,activity_features])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            probability = bimbas.predict_proba(activity_features)[0][1]
        if(probability <= 0.5):
            contributor_type = 'Human'
        else:
            contributor_type = 'Bot'
        confidence = (abs(probability - 0.5)*2).round(3)

        pred = activity_features.set_index([[contributor]]).assign(
            prediction=contributor_type,
            confidence = confidence,
            )
        temp_list.extend([{'contributor':contributor,'bimbas_type':contributor_type,'confidence':confidence}])

    bimbas_prediction = pd.DataFrame.from_dict(temp_list)

    return(bimbas_prediction)

In [55]:
# Write your code here to call execute_bimbas function.
# Pass the contributors list and the filtered events DataFrame as argument to "execute_bimbas" function
# Prediction returned by BIMBAS will have "contributor", "bimbas_type" and "confidence"
# Note: Print the prediction provided by BIMBAS - else this cell will not be graded.

res = execute_bimbas(contributors, df_events)
res

  temp_df = df_events.query('event_type == "CreateEvent" and ref_type == "tag" and event_id not in @list_event_id_covered')
  temp_df = df_events.query('event_type == "IssuesEvent" and action == "opened" and event_id not in @list_event_id_covered')
  .query('event_type == "IssuesEvent" and action == "closed" and event_id not in @list_event_id_covered')
  temp_df = df_events.query('((event_type == "IssuesEvent" and action == "reopened") or event_type == "IssueCommentEvent") \
  .query('event_type == "IssuesEvent" and action == "reopened" and event_id not in @list_event_id_covered')
  temp_df = df_events.query('event_type == "IssueCommentEvent" and event_id not in @list_event_id_covered ')
  temp_df = df_events.query('((event_type == "PullRequestEvent" and action == "reopened") or event_type == "IssueCommentEvent") \
  .query('event_type == "PullRequestEvent" and action == "reopened" and event_id not in @list_event_id_covered')
  temp_df = df_events.query('((event_type == "PullRequestEve

Unnamed: 0,contributor,bimbas_type,confidence
0,AtreyeeS,Human,0.896
1,EwoutH,Human,0.835
2,GPUtester,Bot,0.841
3,JuliaTagBot,Bot,0.345
4,MarcSkovMadsen,Human,0.782
5,Sauravroy34,Human,0.83
6,WarrenWeckesser,Human,0.906
7,ammar-qazi,Human,0.72
8,awvwgk,Human,0.899
9,bioc-workshop-dev,Bot,0.887


### Create a column in your predictions DataFrame to have predictions provided by BIMBAS.
### Now your DataFrame should have the following columns - contributor, bothunter_type, app, bimbas_type, confidence

### Merging bothunter and bimbas predictions

In [58]:
# Merge the predictions provided by BIMBAS to the resultant dataset of Section 3. This final DataFrame should have the following columns -
# contributor, bothunter_type, user_type, bimbas_type, and confidence.
df_merge = pd.merge(res, contributors_pd, on='contributor')
df_merge

Unnamed: 0,contributor,bimbas_type,confidence,bothunter_type,app
0,AtreyeeS,Human,0.896,Human,False
1,EwoutH,Human,0.835,Human,False
2,GPUtester,Bot,0.841,Human,False
3,JuliaTagBot,Bot,0.345,Bot,False
4,MarcSkovMadsen,Human,0.782,Human,False
5,Sauravroy34,Human,0.83,Human,False
6,WarrenWeckesser,Human,0.906,Human,False
7,ammar-qazi,Human,0.72,Human,False
8,awvwgk,Human,0.899,Human,False
9,bioc-workshop-dev,Bot,0.887,Human,False


### Question 2
#### Using Cohen's Kappa compute and report the interrater agreement score between the labels computed by both the bot identification approaches. Mention your interpretation of Cohen's Kappa

In [61]:

cohen_kappa_score(df_merge['bothunter_type'], df_merge['bimbas_type'])
"""
bothunter_type and bimbas_type really disagree.
It's a failbe score.
"""


np.float64(0.4358974358974359)

### Question 3
#### Determine the final type of each contributor. Whenever user_type column has the value "User" check if both bimbas_type and bothunter_type give the same prediction, then consider it as your final prediction in "acc_type" column.
#### For the contributors that have different predictions, i.e. bimbas_type is not same as bothunter_type, then make a manual verification and consider that as the type for that contributor. Add an extra column to the DataFrame ﬁle called 'manual' that has the label determined by you.
#### For manual verification you can make use the GitHub UI and the GitHub API https://api.github.com/users/username

#### Whenever user_type column has the value "Bot", directly write your final acc_type as "Bot Actor"

#### More information:
For each contributor, you can look at their activities in GitHub UI, look at their latest events using GitHub Events API - https://api.github.com/users/<contributor>/events, to make a decision on their type.

In addition, you can also write a very small reason why do you think they are bot or human (e.g., same activity at regular interval so Bot, files committed and code modified looks like Human, comments look like Human, comment look like Bot, test report looks automated so Bot, and so on....). This will be useful to answer your next question.

In [75]:
df_app = df_merge[df_merge['app'] == True]
df_app

Unnamed: 0,contributor,bimbas_type,confidence,bothunter_type,app
12,codecov[bot],Bot,0.938,Bot,True
13,codspeed-hq[bot],Bot,0.916,Bot,True
18,github-actions[bot],Bot,0.145,Bot,True
19,github-advanced-security[bot],Bot,0.703,Bot,True
28,netlify[bot],Bot,0.925,Bot,True
31,pre-commit-ci[bot],Bot,0.936,Bot,True
32,renovate[bot],Bot,0.166,Bot,True


In [76]:
df_same = df_merge[df_merge['bimbas_type'] == df_merge['bothunter_type']]
df_same

Unnamed: 0,contributor,bimbas_type,confidence,bothunter_type,app
0,AtreyeeS,Human,0.896,Human,False
1,EwoutH,Human,0.835,Human,False
3,JuliaTagBot,Bot,0.345,Bot,False
4,MarcSkovMadsen,Human,0.782,Human,False
5,Sauravroy34,Human,0.83,Human,False
6,WarrenWeckesser,Human,0.906,Human,False
7,ammar-qazi,Human,0.72,Human,False
8,awvwgk,Human,0.899,Human,False
10,braingram,Human,0.889,Human,False
11,codecov-commenter,Bot,0.922,Bot,False


In [72]:
df_diff = df_merge[df_merge['bimbas_type'] != df_merge['bothunter_type']]
df_diff

Unnamed: 0,contributor,bimbas_type,confidence,bothunter_type,app
2,GPUtester,Bot,0.841,Human,False
9,bioc-workshop-dev,Bot,0.887,Human,False
14,conda-bot,Bot,0.841,Human,False
15,crvernon,Bot,0.566,Human,False
24,likeajumprope,Bot,0.866,Human,False
25,martin-frbg,Bot,0.864,Human,False
29,ottointhesky,Bot,0.802,Human,False
30,ppxasjsm,Bot,0.831,Human,False
35,stan-buildbot,Human,0.201,Bot,False
36,tardis-bot,Bot,0.331,Human,False


In [74]:
manual_list_dict = [{'contributor': 'GPUtester', 'manual': 'Bot'},
                    {'contributor': 'bioc-workshop-dev', 'manual': 'Human'},
                    {'contributor': 'conda-bot 	', 'manual': 'Bot'},
                    {'contributor': 'crvernon', 'manual': 'Human'},
                    {'contributor': 'likeajumprope', 'manual': 'Human'},
                    {'contributor': 'martin-frbg', 'manual': 'Human'},
                    {'contributor': 'ottointhesky', 'manual': 'Human'},
                    {'contributor': 'ppxasjsm', 'manual': 'Human'},
                    {'contributor': 'stan-buildbot', 'manual': 'Bot'},
                    {'contributor': 'tardis-bot 	', 'manual': 'Bot'},
                    {'contributor': 'zeptodoctor', 'manual': 'Human'}]
manual_list_dict = pd.DataFrame.from_dict(manual_list_dict)
manual_list_dict

Unnamed: 0,contributor,manual
0,GPUtester,Bot
1,bioc-workshop-dev,Human
2,conda-bot \t,Bot
3,crvernon,Human
4,likeajumprope,Human
5,martin-frbg,Human
6,ottointhesky,Human
7,ppxasjsm,Human
8,stan-buildbot,Bot
9,tardis-bot \t,Bot


In [None]:
# Write your code below here to determine the final type of contributor.
# add column named 'acc_type' and write your final decision in it. Final decision is majority of three types - bimbas_type, bothunter_type, manual.


### Question 4
#### Study and report the purpose of these identified bots and humans in the repository assigned to you.
#### For example, the purpose can be based on but not limited to -
##### 1) the type of activities that they are performing (releasing a version on every Sunday, updating the documentation),
##### 2) type of comments they are posting (reviewing code, test summary report),
##### 3) when are they triggered (when a new PR is created, when someone in the project ask the bot to merge the code) and so on.  
#### Note:
##### The purpose can be other than what is provided in examples above.

## Section 5

### Question 1
#### Use the filtered events file that has the events performed by the contributors provided to you
#### Group each event into the following four categories

Issues: IssueCommentEvent, IssuesEvent  
Pull Requests: PullRequestEvent, PullRequestReviewCommentEvent  
Commits: CommitCommentEvent, PushEvent  
Repository: CreateEvent, DeleteEvent, ForkEvent, GollumEvent, MemberEvent, PublicEvent, ReleaseEvent, SponsorshipEvent, WatchEvent  

#### Hint:
1. Add a column called event_group that mentions which event group does that event belong to. Each event (row) should correspond to an event group.  
2. Then perform groupby on ['login','event_group'],  
3. use .agg (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html) to count the number of events performed by each contributor in each group,  
4. use pivot with the required arguments (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html). An example is given [here](#pivot_example) in question 2(b), and
5. Reset index and rename axis with None
7. Finally merge it with your 'acc_type' field from the DataFrame you created in Section4, drop contributor, and fillna with 0
8. The final DataFrame should have the following columns - login, PR, commit, issue, repo, acc_type

The final DataFrame should be looking like the following  
![](https://github.com/fadri14/Software_Evolution_TP_bots/blob/main/event_group.png?raw=1)

In [None]:
# In the resultant DataFrame - each row should corespond to a contributor, and the columns should have all the event groups
# and the type of the contributor (that you decided in the previous DataFrame) and the values should be the number
# of events of that event event group the contriubutor has performed.

issue_group = ['IssueCommentEvent', 'IssuesEvent']
pr_group = ['PullRequestEvent', 'PullRequestReviewCommentEvent']
commit_group = ['CommitCommentEvent', 'PushEvent']
repo_group = ['CreateEvent', 'DeleteEvent', 'ForkEvent', 'GollumEvent', 'MemberEvent', 'PublicEvent', 'ReleaseEvent', 'SponsorshipEvent', 'WatchEvent']

In [None]:
# YOUR CODE BELOW


In [None]:
# YOUR CODE BELOW


### Question 2 (a)

#### Compute the median number of events per event group for Bot+Bot actors and Humans and write in DataFrame.

Row should correspond to type (Bot_BotActor and Human), Column should have Event group name and the values should be the median value of Bot_BotActor or Human for that particular event group. An example is given below

In [None]:
# For example:
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot_app
1,event_group1,val2,Human
2,event_group2,val3,Bot_app
3,event_group2,val4,Human
4,event_group3,val5,Bot_app
5,event_group3,val6,Human
6,event_group4,val7,Bot_app
7,event_group5,val8,Human


In [None]:
# YOUR CODE BELOW

### Question 2 (b)

Plot a heatmap of the DataFrame using seaborn -
1. First convert the dataframe to the required format using pivot, example is given below
2. plot using seaborn - sns.heatmap(df_medians, annot=True, vmin=0, vmax=300, cmap="crest"). More details: https://seaborn.pydata.org/generated/seaborn.heatmap.html)

#### pd.pivot example:
<a id='pivot_example'></a>

In [None]:
# Main DataFrame
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot'},
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot
1,event_group1,val2,Human
2,event_group2,val3,Bot
3,event_group2,val4,Human
4,event_group3,val5,Bot
5,event_group3,val6,Human
6,event_group4,val7,Bot
7,event_group5,val8,Human


In [None]:
# pivot the main DataFrame
df_medians.pivot(index='acc_type', columns='event_group', values='median')#.reset_index().rename_axis(None,axis=1)

event_group,event_group1,event_group2,event_group3,event_group4,event_group5
acc_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bot,val1,val3,val5,val7,
Human,val2,val4,val6,,val8


In [None]:
# YOUR CODE BELOW

In [None]:
# YOUR CODE BELOW


#### What is the difference that you observe between Bots+Bot actors and Humans?

#### What is the differnce that you observe between Event groups?

#### What is the difference that you observe between Bots+Bot actors and Humans and Event groups all considering at the same time?

### Question 3

#### Create boxen plots to visualise the distribution of number of events in each event group.
#### For more information you can visit - https://seaborn.pydata.org/generated/seaborn.boxenplot.html#seaborn.boxenplot
#### You should highlight the data points that correspond to bots using a stripplot in seaborn. https://seaborn.pydata.org/tutorial/categorical.html#categorical-tutorial  
#### Interpret the results of the visualisation.

In [None]:
# YOUR CODE BELOW - Visualize number of events from pull request event group


In [None]:
# YOUR CODE BELOW - Visualise number of events from issue event group


In [None]:
# YOUR CODE BELOW - Visualise number of events from commit event group


In [None]:
# YOUR CODE BELOW - Visualise number of events from repo event group


### Question 4.1

#### Statistical identify whether number of events in each event group is normally distributed or not.
#### Null hypothesis - $H_0$: Sample comes from the data that has normal distribution.
#### Use Shapiro-Wilk test for this purpose. Use the p-value with a threshold of 0.05 to determine whether $H_0$ can be rejected with statistical significance or not.

#### Use shapiro from scipy.stats to perform this test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html)

In [None]:
# YOUR CODE BELOW

### Question 4.2
#### Determine if there is any statistical diﬀerence in number of events between the identified event groups. Perform this test on all pairs of event groups.
#### Null hypothesis - $H_0$: Any two event group come from the same population.
#### If $H_0$ is rejected in Shapiro-Wilk test (at least for one of the two event groups considered for test), use the Mann-Whitney U statistical test for this purpose. If $H_0$ is not rejected in Shapiro-Wilk test, use the independent t-test for this purpose.  
#### In any case, use the p-value with a threshold of 0.01 to determine whether $H_0$ can be rejected with statistical significance.

#### Use mannwhitneyu from scipy.stats to perform Mann-Whitney U test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html)
#### or
#### ttest_ind from scipy.stats to perform independent t-test https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html.

#### More information:

You can pass the following arguments to mannwhitneyu from scipy.stats - (method='exact', nan_policy='omit'). For ttest_ind you can use - (nan_policy='omit')

In [None]:
# YOUR CODE BELOW

### Question 4.3
#### Each time you reject the null hypothesis $H_0$, quantify the effect size of the diﬀerence between the groups using cliﬀ’s delta ($\delta$).
#### To calculate cliﬀ’s delta, you can pass the list of values to cliﬀ delta.py file given in the repository. E.g., cliffsDelta.cliffsDelta(list of values, list of values). This will return the effect size.
#### Refer to the table given in the TP document and mention your interpretation (negligible, small, medium, large).

In [None]:
# YOU CODE BELOW