In [None]:
import pandas as pd

# Convert the tab-separated Authors.txt dump into a comma-separated Authors.csv.
# NOTE(review): no header/names arguments are given, so pandas uses the first
# line of the file as column names — confirm Authors.txt really has a header row.
df = pd.read_csv(
    "Authors.txt",
    sep="\t",
)
df.to_csv(
    "Authors.csv",
    index=False,
)


In [None]:
import pandas as pd

# Inspect the file layout cheaply: parse just the first 100 rows and
# print the top five of them as plain text.
df_preview = pd.read_csv(
    "Authors.txt",
    sep="\t",
    nrows=100,
)
print(df_preview.head(5))


In [None]:
# Parse only the leading rows so that reading stops before the second block
# of the file begins (replace 100 with the actual number of author rows).
# With nrows=100 the Authors.csv written below holds at most 100 rows.
df = pd.read_csv(
    "Authors.txt",
    sep="\t",
    nrows=100,
)
df.to_csv(
    "Authors.csv",
    index=False,
)


In [4]:
import pandas as pd

# Load the full authors table and report its number of rows.
df_authors = pd.read_csv("Authors.csv")
print(df_authors.shape[0])

274361384


In [None]:
import pandas as pd

# Load the paper–author–affiliation link table and report how many distinct
# authors and papers it references.
df_papers = pd.read_csv("PaperAuthorAffiliations.csv")
unique_authors = df_papers['AuthorId'].unique()
unique_papers = df_papers['PaperId'].unique()
# Count the distinct IDs with len(); summing the arrays would add up the ID
# values themselves, which is not a meaningful quantity.
print(len(unique_authors))
print(len(unique_papers))

In [None]:
# Reload the authors table and show its first five rows.
df_authors = pd.read_csv("Authors.csv")
df_authors.head(5)

In [None]:
# Column names to assign to the authors table, in file order.
columns = [
    'AuthorId',
    'Rank',
    'NormalizedName',
    'DisplayName',
    'LastKnownAffiliationId',
    'PaperCount',
    'PaperFamilyCount',
    'CitationCount',
    'CreatedDate',
]

import pandas as pd

# Parse Authors.txt as tab-separated text with no header row, labelling the
# fields with the names above. Every field is kept as a string to avoid type
# conversion errors, and lines that do not match the expected layout are
# skipped rather than raising.
df = pd.read_csv(
    "Authors.txt",
    sep="\t",
    header=None,
    names=columns,
    dtype=str,
    on_bad_lines='skip',
)

# Turn CreatedDate from text into pandas datetimes; values that cannot be
# parsed become NaT because of errors='coerce'.
df['CreatedDate'] = pd.to_datetime(df['CreatedDate'], errors='coerce')


In [None]:
# Write the parsed authors table to Authors2.0.csv without the index column.
# This file has already been created; rename it to Authors.csv afterwards.
df.to_csv(
    "Authors2.0.csv",
    index=False,
)

In [None]:
import pandas as pd
# Peek at the first five rows of Authors.csv without loading the whole file.
df_authors = pd.read_csv(
    "Authors.csv",
    nrows=5,
)
df_authors.head(5)

In [None]:
# Distinct CreatedDate values in the currently loaded df_authors frame.
unique_created_dates = df_authors["CreatedDate"].unique()
n_unique_dates = len(unique_created_dates)
print(f"\nTotal unique dates: {n_unique_dates}")

In [None]:
import pandas as pd

def extract_author_countries(mag_dir, destination):
    """
    Assign each author the country that occurs most often among their affiliations.

    Reads PaperAuthorAffiliations.csv, Affiliations.csv and Authors.csv from
    ``mag_dir``, counts per author how many authorship rows fall in each
    country (``Iso3166Code``), keeps the most frequent one, and writes a
    tab-separated table with the columns AuthorId, DisplayName and Country to
    ``destination``. Authors without any affiliation that carries a country
    code get an empty Country.

    Parameters:
        mag_dir: directory containing the three input CSV files,
            e.g. "/home/emcj/data/MAG/"
        destination: path of the output file,
            e.g. "/home/emcj/data/MAG/AuthorCountries.csv"
    Returns:
        None
    """

    # Load needed CSVs
    paa = pd.read_csv(f"{mag_dir}/PaperAuthorAffiliations.csv", usecols=["AuthorId", "AffiliationId"])
    # Only treat empty fields as missing here: with pandas' default NA tokens
    # the ISO 3166 code "NA" (Namibia) would be parsed as NaN and dropped.
    aff = pd.read_csv(
        f"{mag_dir}/Affiliations.csv",
        usecols=["AffiliationId", "Iso3166Code"],
        keep_default_na=False,
        na_values=[""],
    )
    authors = pd.read_csv(f"{mag_dir}/Authors.csv", usecols=["AuthorId", "DisplayName"])

    # Attach a country code to every authorship row that has a known affiliation
    merged = paa.merge(aff, on="AffiliationId", how="inner")

    # Count how many times each author appears per country
    # (groupby skips rows whose country code is missing)
    country_counts = (
        merged.groupby(["AuthorId", "Iso3166Code"])
        .size()
        .reset_index(name="num_authorships")
    )

    # Keep the most frequent country per author; ties are resolved
    # deterministically in favour of the alphabetically first country code.
    top_country = (
        country_counts.sort_values(
            ["AuthorId", "num_authorships", "Iso3166Code"],
            ascending=[True, False, True],
        )
        .drop_duplicates("AuthorId")
        .rename(columns={"Iso3166Code": "Country"})
    )

    # Left merge keeps every author, including those without a country
    author_country = authors.merge(top_country[["AuthorId", "Country"]], on="AuthorId", how="left")

    # Save to file (tab-separated despite the .csv extension used by callers)
    author_country.to_csv(destination, sep="\t", index=False, encoding="utf-8")
    print(f"Saved AuthorCountries to: {destination}")

    

In [None]:
import pandas as pd
# Read a single row of Affiliations.csv to inspect its column headers.
df_pauf = pd.read_csv(
    "Affiliations.csv",
    nrows=1,
)
df_pauf.head(5)

In [None]:
import pandas as pd

# Load the affiliations table
df = pd.read_csv("Affiliations.csv")

# Headers expected in the file, each in "Name:type" form (extend as needed).
typed_headers = [
    "AffiliationId:long",
    "Rank:uint",
    "NormalizedName:string",
    "DisplayName:string",
    "GridId:string",
    "OfficialPage:string",
    "WikiPage:string",
    "PaperCount:long",
    "PaperFamilyCount:long",
    "CitationCount:long",
    "Iso3166Code:string",
    "Latitude:float?",
    "Longitude:float?",
    "CreatedDate:DateTime",
]

# Map every typed header to the bare name in front of the colon and apply it;
# headers not listed above are left unchanged.
df = df.rename(columns={header: header.split(":")[0] for header in typed_headers})

# Overwrite the source file with the renamed columns.
df.to_csv(
    "Affiliations.csv",
    index=False,
)


In [None]:
# Strip the ":type" suffixes from the headers of the one-row affiliations
# sample, then show the resulting column index.
pauf_header_map = {
    "AffiliationId:long": "AffiliationId",
    "Rank:uint": "Rank",
    "NormalizedName:string": "NormalizedName",
    "DisplayName:string": "DisplayName",
    "GridId:string": "GridId",
    "OfficialPage:string": "OfficialPage",
    "WikiPage:string": "WikiPage",
    "PaperCount:long": "PaperCount",
    "PaperFamilyCount:long": "PaperFamilyCount",
    "CitationCount:long": "CitationCount",
    "Iso3166Code:string": "Iso3166Code",
    "Latitude:float?": "Latitude",
    "Longitude:float?": "Longitude",
    "CreatedDate:DateTime": "CreatedDate",
}
df_pauf = df_pauf.rename(columns=pauf_header_map)

df_pauf.columns


In [None]:
# Input directory and output file for the author → country table.
mag_dir = "/home/emcj/data/MAG/"
destination = "/home/emcj/data/MAG/AuthorCountries.csv"

# Build AuthorCountries.csv from the files in mag_dir.
extract_author_countries(mag_dir=mag_dir, destination=destination)


In [None]:
import pandas as pd

def assign_genders_to_authors(mag_dir, destination):
    """
    Assign gender labels to authors using genderized first names.

    Reads Authors.csv and GenderizedFirstnames.csv from ``mag_dir``, attaches
    each author's country from AuthorCountries.csv when that file exists
    (otherwise every author gets the country "unknown"), takes the first
    whitespace-separated token of DisplayName as the first name, and
    left-joins the genderized table on (Firstname, Country). The result
    (AuthorId, DisplayName, Country, gender, genderized) is written to
    ``destination`` as a tab-separated file, sorted by genderized descending
    and then by AuthorId.

    Parameters:
        mag_dir: directory holding the input files,
            e.g. "/home/emcj/data/MAG/"
        destination: path of the output file,
            e.g. "/home/emcj/data/MAG/AuthorsGenderized.csv"
    Returns:
        None
    """

    # Load MAG author data and genderized names
    authors = pd.read_csv(f"{mag_dir}/Authors.csv", usecols=["AuthorId", "DisplayName", "LastKnownAffiliationId"])
    # NOTE(review): assumes GenderizedFirstnames.csv is comma-separated with
    # columns Firstname, Country, gender, genderized — confirm the format.
    genderized = pd.read_csv(f"{mag_dir}/GenderizedFirstnames.csv")

    # Load countries if available via AuthorCountries or use dummy
    try:
        # extract_author_countries writes this file tab-separated, so it has
        # to be read with the same separator; the default comma would yield a
        # single fused column and a KeyError on the column selection below.
        countries = pd.read_csv(f"{mag_dir}/AuthorCountries.csv", sep="\t")
    except FileNotFoundError:
        authors["Country"] = "unknown"
    else:
        authors = authors.merge(countries[["AuthorId", "Country"]], on="AuthorId", how="left")

    # Extract first name from DisplayName (first whitespace-separated token)
    authors["Firstname"] = authors["DisplayName"].str.split().str[0]
    # Use the same placeholder for a missing country on both sides of the join
    # NOTE(review): pandas' default NA parsing also turns a literal "NA"
    # country value into missing, so it ends up as "unknown" here — confirm
    # whether that matters for this data.
    authors["Country"] = authors["Country"].fillna("unknown")
    genderized["Country"] = genderized["Country"].fillna("unknown")

    # Merge on Firstname and Country
    # NOTE(review): if genderized holds several rows for one (Firstname,
    # Country) pair, the matching authors are duplicated — verify uniqueness.
    merged = authors.merge(genderized, how="left", on=["Firstname", "Country"])

    # Authors without a match get the sentinel score -3
    merged["genderized"] = merged["genderized"].fillna(-3)

    # Select and reorder output columns
    result = merged[["AuthorId", "DisplayName", "Country", "gender", "genderized"]].sort_values(
        by=["genderized", "AuthorId"], ascending=[False, True]
    )

    # Write to destination as TSV
    result.to_csv(destination, sep="\t", index=False, encoding="utf-8")
    print(f"Saved to: {destination}")


In [None]:
# Input directory and output file for the genderized authors table.
mag_dir = "/home/emcj/data/MAG/"
destination = "/home/emcj/data/MAG/AuthorsGenderized.csv"

# Build AuthorsGenderized.csv from the files in mag_dir.
assign_genders_to_authors(mag_dir=mag_dir, destination=destination)


In [None]:
import pandas as pd 
# Peek at the first five rows of the tab-separated AuthorsGenderized.csv.
df_author_gender = pd.read_csv(
    "AuthorsGenderized.csv",
    sep="\t",
    nrows=5,
)
df_author_gender.head(5)

In [None]:
# List the distinct gender labels in the loaded AuthorsGenderized sample.
# The column is looked up under either spelling: assign_genders_to_authors
# writes it as "gender", while other files in this notebook use "Gender".
gender_column = "Gender" if "Gender" in df_author_gender.columns else "gender"
unique_author_gender = df_author_gender.loc[:, gender_column].unique()
print(f"\nTotal unique gender: {len(unique_author_gender)}")
unique_author_gender

In [None]:
import pandas as pd 
# Peek at the first five rows of the tab-separated AuthorCountries.csv.
df_author_countries = pd.read_csv(
    "AuthorCountries.csv",
    sep="\t",
    nrows=5,
)
df_author_countries.head(5)

In [None]:
import pandas as pd 
# Load only the first five author rows for a quick look at the columns.
df_authors = pd.read_csv(
    "Authors.csv",
    nrows=5,
)
df_authors.head(5)

In [None]:
import pandas as pd 
# Load the first 10,000 rows of the tab-separated GenderizedFirstnames.txt
# and display the frame.
df_genderized_firsnames = pd.read_csv(
    "GenderizedFirstnames.txt",
    sep="\t",
    nrows=10000,
)
df_genderized_firsnames

In [None]:
import pandas as pd 
# Load the first 10,000 rows of the tab-separated AuthorsGenderized.csv
# and display the frame.
df_genderized_authors = pd.read_csv(
    "AuthorsGenderized.csv",
    sep="\t",
    nrows=10000,
)
df_genderized_authors

In [None]:
# Distinct Country values in the genderized-author sample (despite its name,
# this variable holds countries, not genders).
unique_gender_author = df_genderized_authors["Country"].unique()
n_unique_countries = len(unique_gender_author)
print(f"\nTotal unique country: {n_unique_countries}")
unique_gender_author

In [None]:
# Distinct Country values in the genderized-author sample. The count and the
# displayed array both come from the variable assigned in this cell, so the
# cell does not depend on a leftover variable from an earlier cell.
unique_genderized_authors = df_genderized_authors.loc[:, "Country"].unique()
print(f"\nTotal unique country: {len(unique_genderized_authors)}")
unique_genderized_authors

In [None]:
import pandas as pd 
# Load the complete tab-separated Authors_Genderized_Full.csv and print its
# row count. The bare head(5) call is not the last expression of the cell,
# so its result is not displayed.
df_genderized_authors = pd.read_csv(
    "Authors_Genderized_Full.csv",
    sep='\t',
)
df_genderized_authors.head(5)
print(df_genderized_authors.shape[0])

In [None]:
import pandas as pd
# Print the raw text of line 32858 (zero-indexed) of the file and stop
# reading there; nothing is printed if the file has fewer lines.
with open("Authors_Genderized_Full.csv") as f:
    for i, line in enumerate(f):
        if i != 32858:
            continue
        print(line)
        break

In [None]:
# Assign column names to the headerless, tab-separated
# Authors_Genderized_Full.csv and save the result as the comma-separated
# Authors_Genderized_Full_with_headers.csv.
import pandas as pd

col_names = [
    'AuthorId',
    'DisplayName',
    'Country',
    'Gender',
    'Genderized',
]
df = pd.read_csv(
    "Authors_Genderized_Full.csv",
    sep='\t',
    header=None,
    names=col_names,
)

df.to_csv(
    "Authors_Genderized_Full_with_headers.csv",
    index=False,
)



In [None]:
import pandas as pd 

# Load the psychology author-citation table and report its number of rows.
df_author_cite = pd.read_csv("Psychology_Author_Citations.csv")
print(df_author_cite.shape[0])


In [3]:
import pandas as pd

# Read the first two rows of the genderized citation table for a quick look.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the file was saved with its index — confirm whether that column is wanted.
df = pd.read_csv(
    "Psychology_Author_Citations_Genderized.csv",
    nrows=2,
)
df.head(5)

Unnamed: 0,SourceAuthorId,SourceGender,SourceGenderized,TargetAuthorId,TargetGender,TargetGenderized
0,2654711026,,-2,2678348436,,-2
1,2654711026,,-2,2708090778,,-2
