# Data cleaning of GHSearch Data

In [1]:
import requests
import pandas as pd
import base64

In [2]:
### Personal Access Token upload
from ipywidgets import FileUpload, interact
@interact(files=FileUpload())
def set_token(files={}):
    global token
    if files:
        for key, values in files.items():
            token = values['content'].decode("utf-8").strip()
            print("Token Loaded!")
baseUrl="https://api.github.com/repos/TheAlgorithms/Python"


interactive(children=(FileUpload(value={}, description='Upload'), Output()), _dom_classes=('widget-interact',)…

In [3]:

### token headers inclusion
def token_auth(request):
    request.headers["User-Agent"] = "RMS_Research_Project"
    request.headers["Authorization"] = "token {}".format(token)
    # print(f"{request.headers}")
    return request

In [12]:

### generic get request service
def get(url, requestType):
    if requestType == "GIT":
        response = requests.get(url,auth = token_auth)
    else:
        response = requests.get(url)
    if(response.status_code==200):
        data = response.json()
        return data
    print("Request:{} failed".format(url))

def decodeFileContent(content, encodingType):
  file_content = ""
  if encodingType == 'base64':
    file_content = base64.b64decode(content).decode()
  return file_content

In [5]:
df = pd.read_csv("data/GH_Search_Query_Data.csv")
df.head(3)
len(df)

1129

In [None]:
for index, row in df.iterrows():
  url = f"https://api.github.com/repos/{row['name']}/contents/"
  # print(url)
  data = get(url, "GIT")
  if data:
    _package_json_found = False
    _package_lock_found = False
    # print(data)
    for val in data:
      if val['name'] == "package.json":
        # print(f"package json found in {url}")
        _package_json_found = True
      elif val['name'] == "package-lock.json":
        # print(f"package lock found in {url}")
        _package_lock_found = True
    df.loc[index, 'package.json'] = _package_json_found
    df.loc[index,'package-lock.json'] = _package_lock_found
  else:
    df.loc[index,'package.json'] = df.loc[index,'package-lock.json'] = False


In [None]:
df.to_csv('data/df.csv')

In [6]:
df = pd.read_csv('data/df.csv')
df

Unnamed: 0.1,Unnamed: 0,name,isFork,commits,branches,defaultBranch,releases,contributors,license,watchers,...,totalIssues,openIssues,totalPullRequests,openPullRequests,lastCommit,lastCommitSHA,hasWiki,isArchived,package.json,package-lock.json
0,0,facebook/react,False,14704,110,main,96,430.0,MIT License,6667,...,11046.0,697.0,11739,250,2022-01-05T04:45:45,fe419346da0de40858d5a5c5992c29a1e0b79bb5,True,False,True,False
1,1,twbs/bootstrap,False,21648,74,main,76,367.0,MIT License,6934,...,21178.0,221.0,13483,112,2022-04-22T02:56:05,9a614a7e1dd21d6a4b2f6e36c0ad390fd4767528,False,False,True,True
2,2,trekhleb/javascript-algorithms,False,1062,1,master,0,186.0,MIT License,4306,...,296.0,103.0,613,170,2022-09-21T07:12:46,d3c0ee6f7af3fce4a3a2bdc1c5be36d7c2d9793a,False,False,True,True
3,3,airbnb/javascript,False,1929,5,master,0,414.0,MIT License,3851,...,1191.0,89.0,1376,58,2022-09-19T05:21:48,75a908aacfe9ad86a18f3930cfff9c4636ffd644,True,False,True,False
4,4,facebook/react-native,False,24782,94,main,196,359.0,MIT License,3674,...,22733.0,1937.0,10798,323,2022-05-26T10:09:37,4f1aa4d686291d05d5fcab2bdcf691bb375d6fec,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,1124,mbrn/material-table,False,1577,8,master,111,163.0,MIT License,52,...,2556.0,31.0,630,5,2022-03-12T08:24:23,18f7cce93907ce75ead050eccfc86753f7acf00a,True,False,True,False
1125,1125,oblador/react-native-progress,False,146,1,master,22,35.0,MIT License,41,...,169.0,107.0,83,18,2021-07-11T02:55:34,cd2220e4af78e34e3533a675665b380c589d5593,True,False,True,False
1126,1126,deepsyx/home-automation,False,48,1,master,0,,MIT License,150,...,12.0,11.0,0,0,2017-09-05T12:12:13,faacd08df7d4ac67ebe11556fdefbe761a6e01c9,True,False,False,False
1127,1127,tyroprogrammer/learn-react-app,False,77,6,master,0,26.0,MIT License,50,...,10.0,3.0,49,5,2020-12-03T12:50:50,a41485ee612d35f32fca56c2d2de41d2092fc026,True,False,True,True


In [7]:
package_lock_json_df_package_json_df = df[((df['package-lock.json'] == True) | (df['package.json'] == True))]
len(package_lock_json_df_package_json_df)

1043

### filtering the repositories without package-lock.json

In [8]:
package_lock_json_df = df[df['package-lock.json'] == True]
len(package_lock_json_df)

377

In [9]:
package_lock_json_df = pd.read_excel('data/package-lock-json.xlsx')

### Extracting package-lock.json content

In [10]:
import json

def extract_package_lock_json(url):
  data = get(f"{url}/contents/package-lock.json", "GIT")
  
  decoded_package_lock_json = None

  if data and 'content' in data:
    file_content = data['content']

    file_content_encoding = data.get('encoding')

    # print(f"{file_content_encoding}: {file_content}")


    if file_content and file_content_encoding:
      decoded_package_lock_json = json.loads(decodeFileContent(file_content,file_content_encoding))

    # print(f"package_lock_json: {decoded_package_lock_json}")

  return decoded_package_lock_json

### Collecting dependencies info of each repo and storing in the dataframe

In [13]:
def get_package_dependencies_of_repos():
  packages_collection = {}
  # latest_versions_package_collection = {}

  for index, row in package_lock_json_df.iterrows():
    repo_name = row['name']

    url = f"https://api.github.com/repos/{repo_name}"


    file_content = extract_package_lock_json(url)

    dependencies_info = []

    if file_content and 'dependencies' in file_content:

      dependencies = file_content['dependencies']

      
      # print(f"keys : {dependencies.keys()}")

      for package_name in dependencies.keys():
        if package_name and dependencies[package_name]:
          current_version = dependencies[package_name]['version']
          
          # package_name = package_name.replace("node_modules/","")

          # print(f"package: {package_name} version: {current_version}")

          # package_name_version = f"{package_name}/{current_version}"

          dependencies_info.append({
              'package_name': package_name,
              'version': current_version
          })

          # print(packages_collection)

          if package_name in packages_collection:
            packages_collection[package_name].append((repo_name, current_version))
          else:
            packages_collection[package_name] = [(repo_name, current_version)]
      
    package_lock_json_df.loc[index, 'apiUrl'] = url
    package_lock_json_df.loc[index,'dependenciesInfo'] = json.dumps(dependencies_info)
    package_lock_json_df.loc[index,'totalDependencies'] = len(dependencies_info)

  return packages_collection

dependencies_collection = get_package_dependencies_of_repos()

Request:https://api.github.com/repos/leon-ai/leon/contents/package-lock.json failed


In [14]:
# checking the unique dependencies collection count
len(dependencies_collection.keys())

9477

In [None]:
dependency_counter_df = pd.DataFrame(columns=['dependency_name', 'dependency_counter','dependency_info'])

for dependency_name in dependencies_collection.keys():
  # dependency_counter[dependency_name] = len(dependencies_collection[dependency_name])
  dependency_counter_df = dependency_counter_df.append({'dependency_name': dependency_name,
  'dependency_info': dependencies_collection[dependency_name],
  'dependency_counter': len(dependencies_collection[dependency_name])}, ignore_index = True)

dependency_counter_df.sort_values(by=['dependency_counter'], ascending = False)
dependency_counter_df.to_csv("data/dependency_counter_df.csv")

### filtering the repositories with zero dependencies

In [16]:
# sorting in ascending order of total dependencies
package_lock_json_df_with_dependencies = package_lock_json_df[package_lock_json_df['totalDependencies']>0]
package_lock_json_df_with_dependencies = package_lock_json_df_with_dependencies.sort_values(by='totalDependencies')
package_lock_json_df_with_dependencies

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,name,isFork,commits,branches,defaultBranch,releases,contributors,...,isArchived,package.json,package-lock.json,apiUrl,dependenciesInfo,totalDependencies,outdated_count,upgradation_count,up_to_date_count,error_log
0,0,0,610,thomaspark/flexboxfroggy,False,584,3,gh-pages,0,97,...,False,True,True,https://api.github.com/repos/thomaspark/flexbo...,"[{""package_name"": ""animate.css"", ""version"": ""3...",2,0,2,0,[]
1,1,1,343,timqian/chinese-independent-blogs,False,2201,4,master,0,454,...,False,True,True,https://api.github.com/repos/timqian/chinese-i...,"[{""package_name"": ""asynckit"", ""version"": ""0.4....",14,0,8,6,[]
2,2,2,872,stalniy/casl,False,1402,18,master,198,56,...,False,True,True,https://api.github.com/repos/stalniy/casl,"[{""package_name"": ""@tootallnate/once"", ""versio...",28,1,18,9,[]
3,3,3,122,git-tips/tips,False,399,4,master,0,68,...,False,True,True,https://api.github.com/repos/git-tips/tips,"[{""package_name"": ""101"", ""version"": ""1.6.3""}, ...",31,0,20,11,[]
4,4,4,433,mrdoob/stats.js,False,144,1,master,16,19,...,False,True,True,https://api.github.com/repos/mrdoob/stats.js,"[{""package_name"": ""ansi-styles"", ""version"": ""3...",33,1,24,8,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,325,325,37,algorithm-visualizer/algorithm-visualizer,False,667,20,master,0,24,...,False,True,True,https://api.github.com/repos/algorithm-visuali...,"[{""package_name"": ""@babel/code-frame"", ""versio...",1275,39,971,265,[]
326,326,326,60,react-boilerplate/react-boilerplate,False,1457,29,master,16,186,...,False,True,True,https://api.github.com/repos/react-boilerplate...,"[{""package_name"": ""@babel/cli"", ""version"": ""7....",1292,34,984,274,[]
327,327,327,202,checkly/headless-recorder,False,337,6,main,20,21,...,False,True,True,https://api.github.com/repos/checkly/headless-...,"[{""package_name"": ""@babel/code-frame"", ""versio...",1321,25,852,443,"[{""dependency_info"": {""package_name"": ""tailwin..."
328,328,328,834,xaboy/form-create,False,824,6,2.5,0,5,...,False,True,True,https://api.github.com/repos/xaboy/form-create,"[{""package_name"": ""@babel/code-frame"", ""versio...",1332,33,882,417,[]


In [None]:
package_lock_json_df_with_dependencies.to_csv("data/package-lock-json-with_dependencies.csv")

In [17]:
package_lock_json_df_with_dependencies = pd.read_csv("data/package-lock-json-with_dependencies.csv")

In [18]:
def get_latest_package_info(packageName):
    response = get(f"https://registry.npmjs.org/{packageName}/latest","Other")
    
    if response and 'version' in response:
      return response['version']

In [19]:
def isVersionActive(packageName):
    packageInfo = get(f"https://registry.npmjs.org/{packageName}","Other")
    
    if packageInfo:
        if 'deprecated' in packageInfo:
            return (False, None)
        return (True, None)
    return (False,"Request Failed")

In [8]:
def collect_counts():
  count = 0
  # max_count = 100
  # for index, row in package_lock_json_df_with_dependencies[100:190].iterrows():
  # for index, row in package_lock_json_df_with_dependencies[190:300].iterrows():
  # for index, row in package_lock_json_df_with_dependencies[200:300].iterrows():
  for index, row in package_lock_json_df_with_dependencies[330:].iterrows():
    upgradation_count = 0
    
    outdated_count= 0
    
    up_to_date_count = 0

    # print(f"upgradation_count:{upgradation_count} \n outdated_count:{outdated_count} \n up_to_date_count:{up_to_date_count}")
    
    error_log = []

    print(f"Analyzing {index} -> URL : {row['apiUrl']}")

    dependencies_list = json.loads(row['dependenciesInfo'])

    for _dependency in dependencies_list:
      package_name = _dependency['package_name']

      current_version = _dependency['version']

      try:
        package_name_version = f"{package_name}/{current_version}"

        (is_version_active, error) = isVersionActive(package_name_version)

        latest_version = get_latest_package_info(package_name)

        if error:
          raise Exception("Error:Request failed")

        elif not is_version_active:
          outdated_count += 1
          
          # print(f"Deprecated:- Current version:{current_version} Package :{package_name_version} is Deprecated")

        elif latest_version and latest_version != current_version:
          upgradation_count += 1
          
          # print(f"Upgrade:- Current version:{current_version} latest version:{latest_version}")

        else:
          up_to_date_count += 1

      except Exception as e:
        error_log.append({'dependency_info':_dependency, 'error': str(e)})
    
    package_lock_json_df_with_dependencies.loc[index,'outdated_count'] = outdated_count
    
    package_lock_json_df_with_dependencies.loc[index,'upgradation_count'] = upgradation_count
    
    # package_lock_json_df_with_dependencies.loc[index,'up_to_date_count'] = up_to_date_count
    
    package_lock_json_df_with_dependencies.loc[index,'up_to_date_count'] = up_to_date_count
    
    # print(error_log)
    package_lock_json_df_with_dependencies.loc[index,'error_log'] = json.dumps(error_log)

    count += 1

    print(f"count:{count}")
    
    if count % 10 == 0:    
      print("-----checkpoint reached, saving the file------")
      
      package_lock_json_df_with_dependencies.to_csv(f"data/checkpoints/package-lock-json-with_dependencies_{index}_{count}.csv")
      
      print(f"----------saved the file checkpoints/package-lock-json-with_dependencies_{index}_{count}.csv------")

collect_counts()

Analyzing 330 -> URL : https://api.github.com/repos/phobal/ivideo
count:1
Analyzing 331 -> URL : https://api.github.com/repos/vulcanjs/vulcan
count:2
