# Análisis de código fuente de repositorios (líneas de código)
Este análisis considera únicamente las líneas de código (LOC) de cada repositorio.

## Cargar dependencias

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Cargar archivo CSV de entrada

In [2]:
# Relative path of the input CSV (presumably the cloc output, judging by the file name).
input_file = '../csvs/cloc_output.csv'

# Build the raw DataFrame from the input CSV; the bare name on the last line
# makes the notebook render it as the cell's output.
raw_df = pd.read_csv(filepath_or_buffer=input_file)
raw_df

Unnamed: 0,project_id,language,files,blank,comment,code
0,0xEduardo_nodeless,JSON,2,0,0,1493
1,0xEduardo_nodeless,Markdown,1,41,0,75
2,0xEduardo_nodeless,JavaScript,1,7,0,34
3,0xEduardo_nodeless,YAML,1,5,0,32
4,0xEduardo_nodeless,SUM,5,53,0,1634
...,...,...,...,...,...,...
5370,zotoio_github-task-manager,Bourne,Shell,2,6,2
5371,zotoio_github-task-manager,Dockerfile,1,4,0,12
5372,zotoio_github-task-manager,SVG,1,0,0,1
5373,zotoio_github-task-manager,Text,1,0,0,1


Copiamos este DataFrame para convertir a tipo numérico las columnas `files`, `blank`, `comment` y `code`.

In [3]:
# Work on a copy so raw_df keeps the values exactly as they were read from the CSV.
df = raw_df.copy()

# Columns that must hold counts; each one is converted to a numeric dtype.
numeric_columns = ['files', 'blank', 'comment', 'code']
for column in numeric_columns:
    # errors='coerce' replaces every value that cannot be parsed with NaN instead of raising.
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Report values that were present in the raw data but became NaN, so the coercion is not silent.
# NOTE(review): the displayed sample contains a row whose language is 'Bourne' and whose
# `files` value is 'Shell', i.e. the remaining counts of that row are shifted by one column.
# Such rows look malformed upstream and should probably be fixed where the CSV is generated.
coerced_counts = (df[numeric_columns].isna() & raw_df[numeric_columns].notna()).sum()
if coerced_counts.any():
    print('Non-numeric values coerced to NaN:', coerced_counts[coerced_counts > 0].to_dict())

In [4]:
# Save the numeric-typed DataFrame to a new CSV file (without the index column).
output_file = '../temp_data/merged_code_analysis.csv'

# to_csv does not create missing folders, so make sure the target directory exists
# before writing; without it this cell fails when ../temp_data is absent.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)

## Análisis estadístico

In [5]:
# Keep only the per-language rows: rows whose language is 'SUM' are left out
# (in the displayed sample they hold the per-project totals of the other rows).
is_language_row = df['language'].ne('SUM')
filtered_df = df.loc[is_language_row]
filtered_df

Unnamed: 0,project_id,language,files,blank,comment,code
0,0xEduardo_nodeless,JSON,2.0,0.0,0,1493
1,0xEduardo_nodeless,Markdown,1.0,41.0,0,75
2,0xEduardo_nodeless,JavaScript,1.0,7.0,0,34
3,0xEduardo_nodeless,YAML,1.0,5.0,0,32
5,0xayot_waas,JSON,3.0,0.0,0,32446
...,...,...,...,...,...,...
5369,zotoio_github-task-manager,XML,1.0,0.0,0,17
5370,zotoio_github-task-manager,Bourne,,2.0,6,2
5371,zotoio_github-task-manager,Dockerfile,1.0,4.0,0,12
5372,zotoio_github-task-manager,SVG,1.0,0.0,0,1


In [6]:
# Total files and lines of code per language, each language's share of all code
# (in percent), ordered from the largest code total to the smallest.
# Built as a single non-mutating chain: no in-place sort and no stray intermediate display.
sums_per_language = (
    filtered_df
    .groupby('language')
    .agg({'files': 'sum', 'code': 'sum'})
    .reset_index()
    # percentage is relative to the sum of 'code' over every language
    .assign(percentage=lambda totals: totals['code'] / totals['code'].sum() * 100)
    .sort_values(by='code', ascending=False)
)

# Show at most the 20 languages with the most code.
sums_per_language.head(20)

Unnamed: 0,language,files,code,percentage
43,JavaScript,55028.0,14974991,31.155751
38,JSON,24582.0,14676398,30.534523
59,PHP,46265.0,7320612,15.230671
85,TypeScript,37977.0,3462500,7.203797
13,CSS,12962.0,2598909,5.407079
74,SQL,398.0,1750698,3.64236
51,Markdown,8001.0,969013,2.01605
14,CSV,118.0,689432,1.434376
93,YAML,3482.0,261209,0.54345
75,SVG,4169.0,188628,0.392444


In [7]:
# Languages that are reported individually; every other language is folded into "Other".
keep_languages = ["JavaScript", "TypeScript", "C#", "PowerShell", "Ruby", "Java", "Python", "Go"]

# True for the rows whose language is not in the list above.
mask = ~sums_per_language["language"].isin(keep_languages)

# Relabel the masked rows, re-aggregate files and code per (relabelled) language and
# order by code, largest first. Only 'files' and 'code' are aggregated, so the result
# has no 'percentage' column.
sums_per_language = (
    sums_per_language
    .assign(language=sums_per_language["language"].mask(mask, "Other"))
    .groupby("language", as_index=False)
    .agg({"files": "sum", "code": "sum"})
    .sort_values(by='code', ascending=False)
)
sums_per_language

Unnamed: 0,language,files,code
4,Other,108159.0,29343615
3,JavaScript,55028.0,14974991
8,TypeScript,37977.0,3462500
6,Python,2116.0,185582
1,Go,704.0,58050
0,C#,272.0,19892
2,Java,345.0,18962
7,Ruby,38.0,1181
5,PowerShell,2.0,160
