## Prepare

In [None]:
# Optional: uncomment the next line to install the notebook's dependencies
# into the kernel's environment before running the remaining cells.
# %pip install -r requirements.txt

In [None]:
import pandas as pd
import plotly.express as px

OpenAlex data on *computational social science*

In [None]:
# Load the data
csv_path = "./data/CSS_openalex.csv"
df = pd.read_csv(csv_path)
df.head()

Shaping data

In [None]:
# Show every column label as a plain Python list
df.columns.tolist()

In [None]:
# Keep only the rows that have author names. The explicit .copy() makes the
# filtered result an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = df[df["authorships.author.display_name"].notna()].copy()

# Parse the publication date strings into datetimes.
df["publication_date"] = pd.to_datetime(df["publication_date"])

# Author names are "|"-separated inside a single cell: count the pieces.
df["nb_authors"] = df["authorships.author.display_name"].str.split("|").str.len()

# Boolean flag: True when the recorded language code is "en".
df["language_english"] = df["language"] == "en"

# Store the publication year as an integer column.
df["publication_year"] = df["publication_year"].astype("int64")

## Analyze

In [None]:
# Proportion of rows flagged as English vs. not English
english_flag = df["language_english"]
english_flag.value_counts(normalize=True)

In [None]:
# Ten most-cited publications
top_citations = (
    df[["title", "publication_year", "type", "cited_by_count"]]
    .sort_values(by="cited_by_count", ascending=False)
    .head(10)
)
# End the cell on the expression itself so the table is actually displayed;
# an assignment alone produces no output.
top_citations

## Visualize

With plotly

In [None]:
# Group the rows by year of publication date ("YS" = year-start bins) and
# count the number of publications in each bin
publications_per_year = df.resample("YS", on="publication_date").size()
fig = px.bar(publications_per_year)

# Display the chart
fig.show()

With seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of publications per year, from 2000 onwards. The index and the count
# column are named explicitly because the column names produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x;
# naming them here keeps the "publication_year" / "count" columns that the
# plot below relies on.
table = (
    df.loc[df["publication_year"] >= 2000, "publication_year"]
    .value_counts()
    .rename_axis("publication_year")
    .reset_index(name="count")
)

fig, ax = plt.subplots(figsize=(10, 6))

# Scatter plot of the yearly counts with a fitted regression line drawn in red.
sns.regplot(
    x="publication_year",
    y="count",
    data=table,
    scatter_kws={"alpha": 0.5},
    line_kws={"color": "red"},
    ax=ax,
)

# Label through the explicit Axes/Figure objects rather than pyplot state.
ax.set_title("Évolution temporelle avec droite de régression")
ax.set_xlabel("Date")
ax.set_ylabel("Valeur")
fig.tight_layout()
plt.show()

## Model

In [None]:
from sklearn.linear_model import LinearRegression

# Explanatory variables (as a plain array) and the target to regress on them
feature_matrix = df[["publication_year", "language_english", "nb_authors"]].values
citation_counts = df["cited_by_count"].values

# Ordinary least-squares fit of citation counts on the three features
model = LinearRegression()
model.fit(feature_matrix, citation_counts)

# One coefficient per feature, in the column order used above
model.coef_


In [None]:
# Single example row, in the feature order the model was fitted with:
# publication_year, language_english, nb_authors
sample = [[2003, True, 2]]
model.predict(sample)

- Use a Logistic Regression
- Train/test...