In [2]:
import pandas as pd

# Load the users table from the CSV file in the current working directory.
# The last cell of this notebook writes its result to a file with the same name.
df = pd.read_csv('./anilist-users-list.csv')

In [3]:
# Report the number of fully duplicated rows before cleaning.
print('data duplikasi =', df.duplicated().sum())

# Rebind `df` to a copy without duplicated rows (drop_duplicates with its default arguments).
print('menghapus data duplikasi jika ada................')
df = df.drop_duplicates()

# Report the count again so the cell output shows the effect of the clean-up.
print('data duplikasi setelah dibersihkan =', df.duplicated().sum())

data duplikasi = 0
menghapus data duplikasi jika ada................
data duplikasi setelah dibersihkan = 0


In [None]:
import requests
import time

async def fetch_usernames(anime_id):
  """Collect the names of the users who reviewed one AniList media entry.

  Walks the paginated `Media.reviews` connection of the AniList GraphQL
  endpoint page by page until `pageInfo.hasNextPage` is false.

  Args:
    anime_id: AniList media id, sent as the `$id` query variable.

  Returns:
    A list of reviewer names in the order the API returned them. When a
    request fails with a status other than 200/429, or the response carries
    no review data, the error is printed (for HTTP errors) and the names
    gathered so far are returned instead of raising.
  """
  # The query text never changes between pages, so it is built once.
  query = '''
    query getUsernames($id: Int, $page: Int){
      Media(id: $id) {
        reviews(page: $page) {
          pageInfo {
            hasNextPage
          }
          nodes {
            user {
              name
            }
          }
        }
      }
    }
    '''

  names = []
  page = 1
  has_next_page = True
  while has_next_page:
    payload = {
        "query": query,
        "variables": {
            'id': anime_id,
            'page': page
        }
    }

    response = requests.post(url='https://graphql.anilist.co', json=payload)

    if response.status_code == 429:
      print('Rate limit exceeded while fetching usernames, Sleeping rn')
      # Fall back to 60 seconds if the server did not send Retry-After.
      time.sleep(int(response.headers.get('Retry-After', 60)) + 1)
      # Retry the SAME page: restarting from page 1 would throw away the
      # names already collected and spend more of the rate limit.
      continue

    if response.status_code != 200:
      print(f"Error: {response.status_code}, {response.text}")
      # An error body has no usable review data; stop paging here.
      break

    # Parse the body once and tolerate null `data` / `Media` / `reviews`.
    media = (response.json().get('data') or {}).get('Media') or {}
    reviews = media.get('reviews')
    if not reviews:
      break

    names.extend(
        node['user']['name']
        for node in (reviews.get('nodes') or [])
        if node and node.get('user')
    )
    has_next_page = bool((reviews.get('pageInfo') or {}).get('hasNextPage'))
    page += 1

  return names

In [None]:
import requests
import time

async def fetch_anime_list(username):
  """Fetch the completed-anime entries of one AniList user.

  Sends a `MediaListCollection` query (type ANIME, status COMPLETED) to the
  AniList GraphQL endpoint and waits and retries while the server answers
  with HTTP 429.

  Args:
    username: AniList user name, sent as the `$userName` query variable.

  Returns:
    The `entries` list of the first list group in the response; each entry
    holds `media` (`id`, `genres`) and `score`. Returns an empty list when
    the request fails with a non-200 status or no list group is returned.
  """
  query = '''
  query getAnimeList($userName: String){
    MediaListCollection(userName: $userName, type: ANIME, status: COMPLETED) {
      lists {
        entries {
          media {
            id
            genres
          }
          score
        }
      }
    }
  }
  '''

  payload = {
      "query": query,
      "variables": {
          'userName': username
      }
  }

  # Retry in a loop instead of recursing, so a long rate-limit streak cannot
  # grow the call stack.
  while True:
    response = requests.post(url='https://graphql.anilist.co', json=payload)
    if response.status_code != 429:
      break
    print('Rate limit exceeded while fetching anime list, Sleeping rn')
    # Fall back to 60 seconds if the server did not send Retry-After.
    time.sleep(int(response.headers.get('Retry-After', 60)) + 1)

  if response.status_code != 200:
      print(f"Error: {response.status_code}, {response.text}")
      return []

  # Parse the body once and tolerate null `data` / `MediaListCollection` / `lists`.
  collection = (response.json().get('data') or {}).get('MediaListCollection') or {}
  lists = collection.get('lists') or []
  if not lists:
    return []

  # NOTE(review): only the first list group is used. If the API returns
  # several groups for this query, the others are ignored — confirm intended.
  return lists[0]['entries'] or []

In [None]:
import pandas as pd

def get_average_score(medias, username, min):
  """Build a one-row DataFrame with a user's mean score per genre.

  Args:
    medias: list of entries shaped like {'media': {'genres': [...]}, 'score': number}.
    username: value stored in the 'username' column of the returned row.
    min: minimum number of entries required; shorter lists are rejected.

  Returns:
    An empty DataFrame when `medias` has fewer than `min` entries. Otherwise
    a single-row DataFrame with one column per genre (in first-seen order)
    followed by 'username'. A genre mean above 10 is divided by 10.
  """
  print('Calculating average score by user', username, 'with', len(medias), 'of animes', end=" ")
  if len(medias) < min:
    print('................> Aborted (Too few animes)')
    return pd.DataFrame()

  # genre -> [running score total, number of entries carrying that genre]
  tally = {}
  for item in medias:
    labels = item['media']['genres']
    points = item['score']
    for label in labels:
      slot = tally.get(label)
      if slot is None:
        tally[label] = [points, 1]
      else:
        slot[0] += points
        slot[1] += 1

  # Turn the tallies into means; means above 10 are scaled down by 10.
  row = {}
  for label, (total, count) in tally.items():
    mean = total / count
    if mean <= 10:
      row[label] = mean
    else:
      row[label] = mean / 10

  # Tag the row with its owner so rows of different users can be stacked.
  row['username'] = username

  print('................> Success')
  return pd.DataFrame([row])

In [None]:
import pandas as pd

async def start_scraping(anime_ids, min_anime_per_user):
  """Scrape per-genre average scores for the reviewers of the given anime.

  For every id in `anime_ids` the reviewers are fetched with
  `fetch_usernames`; each reviewer's list is fetched with `fetch_anime_list`
  and summarised by `get_average_score`.

  Args:
    anime_ids: iterable of AniList media ids whose reviewers are scraped.
    min_anime_per_user: minimum list length passed to `get_average_score`.

  Returns:
    A DataFrame with one row per accepted user. The columns are 'username'
    and the genre names listed below, followed by any additional genre
    that showed up in the data.
  """
  df_columns = [
      "username",
      "Action", "Adventure", "Comedy", "Drama", "Ecchi",
      "Fantasy", "Hentai", "Horror", "Mahou Shoujo", "Mecha",
      "Music", "Mystery", "Psychological", "Romance", "Sci-Fi",
      "Slice of Life", "Sports", "Supernatural", "Thriller"
  ]
  rows = []      # one single-row frame per accepted user, concatenated once at the end
  seen = set()   # users that already have a definitive result (accepted or rejected)

  print('Scraping from', len(anime_ids), 'animes, with minimum', min_anime_per_user, 'anime per user')
  for anime_id in anime_ids:
    names = await fetch_usernames(anime_id)
    for name in names:
      if name in seen:
        print('username already scraped, continuing................')
        continue
      anime_list = await fetch_anime_list(name)
      average_score = get_average_score(anime_list, name, min_anime_per_user)
      if not average_score.empty:
        rows.append(average_score)
      # An empty list may come from a failed request, so such a user is not
      # remembered and can be retried when the name shows up again.
      if anime_list or not average_score.empty:
        seen.add(name)

  print('All precedure ends, YAY')

  if not rows:
    return pd.DataFrame(columns=df_columns)

  # A single concat avoids re-copying the growing frame on every user.
  result = pd.concat(rows, ignore_index=True)
  # Keep the declared columns first, then any genre not declared above.
  extra_columns = [col for col in result.columns if col not in df_columns]
  return result.reindex(columns=df_columns + extra_columns)

In [None]:
import pandas as pd

async def start_scraping_extend(df, anime_ids, min_anime_per_user):
  """Extend an existing users table with reviewers of more anime.

  Works like `start_scraping`, but starts from `df` and skips every user
  whose name is already in its 'username' column.

  Args:
    df: DataFrame with a 'username' column holding the users scraped so far.
    anime_ids: iterable of AniList media ids whose reviewers are scraped.
    min_anime_per_user: minimum list length passed to `get_average_score`.

  Returns:
    A new DataFrame with the rows of `df` followed by one row per newly
    accepted user; `df` itself is not modified.
  """
  rows = []                    # one single-row frame per newly accepted user
  seen = set(df['username'])   # O(1) membership instead of scanning the column each time

  print('Endtend scraping from', len(anime_ids), 'animes, with minimum', min_anime_per_user, 'anime per user')
  for anime_id in anime_ids:
    names = await fetch_usernames(anime_id)
    for name in names:
      if name in seen:
        print('username already scraped, continuing................')
        continue
      anime_list = await fetch_anime_list(name)
      average_score = get_average_score(anime_list, name, min_anime_per_user)
      if not average_score.empty:
        rows.append(average_score)
      # An empty list may come from a failed request, so such a user is not
      # remembered and can be retried when the name shows up again.
      if anime_list or not average_score.empty:
        seen.add(name)

  print('All precedure ends, YAY')

  # A single concat avoids re-copying the growing frame on every user.
  return pd.concat([df, *rows], ignore_index=True)

In [None]:
async def fetch_anime_ids(username):
  """Return the media ids of the entries `fetch_anime_list` yields for `username`."""
  entries = await fetch_anime_list(username)
  media_ids = []
  for item in entries:
    media_ids.append(item['media']['id'])
  return media_ids

In [None]:
# Use the media ids from the fetched list of user 'doddy' as the seed set for scraping.
anime_ids = await fetch_anime_ids('doddy')
# Last expression of the cell: displays how many seed ids were found.
len(anime_ids)

256

In [None]:
# Extend the loaded table with reviewers of the first 50 seed ids; users whose
# fetched list has fewer than 50 entries are rejected by get_average_score.
df_2 = await start_scraping_extend(df, anime_ids[:50], 50)

Endtend scraping from 50 animes, with minimum 50 anime per user
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
Calculating average score by user kulos with 55 of animes ................> Success
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
Calculating average score by user queermoons with 46 of animes ................> Aborted (Too few animes)
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
username already scraped, continuing................
user

In [None]:
# Show the first 20 rows of `df`, the table as loaded and de-duplicated above.
# NOTE(review): the freshly scraped result is `df_2` — confirm `df` is the frame meant here.
df.head(20)

In [None]:
# Save the extended table without the index column. The file name matches the
# CSV read at the top of the notebook, so that input file is overwritten.
df_2.to_csv('anilist-users-list.csv', index=False)